From e07f1a2daab01c51e452a8cd1abf1f62e5dfcd59 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Fri, 17 Aug 2018 17:18:48 -0700 Subject: [PATCH] etcdserver/api/rafthttp: add "etcd_network_raft_send_total_duration_seconds" metric Currently, only v2 metrics ("stats.FollowerStats") track Raft message send latencies. Add a Prometheus histogram to track Raft message send latencies for writes, since heartbeats are probed (see https://github.com/coreos/etcd/pull/10022) and snapshots are already being tracked via https://github.com/coreos/etcd/pull/9997. ``` etcd_network_raft_send_total_duration_seconds_bucket{To="7339c4e5e833c029",Type="MsgProp",le="0.0001"} 1 etcd_network_raft_send_total_duration_seconds_bucket{To="7339c4e5e833c029",Type="MsgProp",le="0.0002"} 1 etcd_network_raft_send_total_duration_seconds_bucket{To="729934363faa4a24",Type="MsgApp",le="0.0001"} 9 etcd_network_raft_send_total_duration_seconds_bucket{To="729934363faa4a24",Type="MsgApp",le="0.0002"} 9 etcd_network_raft_send_total_duration_seconds_bucket{To="7339c4e5e833c029",Type="MsgAppResp",le="0.0001"} 8 etcd_network_raft_send_total_duration_seconds_bucket{To="7339c4e5e833c029",Type="MsgAppResp",le="0.0002"} 8 ``` Signed-off-by: Gyuho Lee --- etcdserver/api/rafthttp/metrics.go | 14 ++++++++++++++ etcdserver/api/rafthttp/stream.go | 10 ++++++++++ 2 files changed, 24 insertions(+) diff --git a/etcdserver/api/rafthttp/metrics.go b/etcdserver/api/rafthttp/metrics.go index 5f862e9decc..182ee4a5648 100644 --- a/etcdserver/api/rafthttp/metrics.go +++ b/etcdserver/api/rafthttp/metrics.go @@ -133,6 +133,19 @@ var ( []string{"From"}, ) + raftSendSeconds = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "etcd", + Subsystem: "network", + Name: "raft_send_total_duration_seconds", + Help: "Total latency distributions of Raft message sends", + + // lowest bucket start of upper bound 0.0001 sec (0.1 ms) with factor 2 + // highest bucket start of 0.0001 sec * 2^15 == 3.2768 sec + Buckets: 
prometheus.ExponentialBuckets(0.0001, 2, 16), + }, + []string{"Type", "To"}, + ) + rttSec = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "etcd", Subsystem: "network", @@ -162,5 +175,6 @@ func init() { prometheus.MustRegister(snapshotReceiveFailures) prometheus.MustRegister(snapshotReceiveSeconds) + prometheus.MustRegister(raftSendSeconds) prometheus.MustRegister(rttSec) } diff --git a/etcdserver/api/rafthttp/stream.go b/etcdserver/api/rafthttp/stream.go index dcb2223ca59..7b20aff96a9 100644 --- a/etcdserver/api/rafthttp/stream.go +++ b/etcdserver/api/rafthttp/stream.go @@ -201,8 +201,10 @@ func (cw *streamWriter) run() { heartbeatc, msgc = nil, nil case m := <-msgc: + start := time.Now() err := enc.encode(&m) if err == nil { + took := time.Since(start) unflushed += m.Size() if len(msgc) == 0 || batched > streamBufSize/2 { @@ -214,6 +216,14 @@ func (cw *streamWriter) run() { batched++ } + // snapshot sends are tracked via separate metrics https://github.com/etcd-io/etcd/pull/9997 + // heartbeats are tracked via prober https://github.com/etcd-io/etcd/pull/10022 + // TODO: track other messages? + if m.Type == raftpb.MsgProp || + m.Type == raftpb.MsgApp || + m.Type == raftpb.MsgAppResp { + raftSendSeconds.WithLabelValues(m.Type.String(), types.ID(m.To).String()).Observe(took.Seconds()) + } continue }