Skip to content

Commit

Permalink
Improve metrics descriptions and naming (#2409)
Browse files Browse the repository at this point in the history
* Improved naming and description of some telemetry metrics. Changed configuration 'tx_confirmation' default to be false

* Added changelog entry

* Updated guide with new telemetry changes

* Renamed clear packet metrics

* Updated some metric descriptions and conditional display

* Fixed metric used to count cleared acknowledgments, updated latencies histogram buckets.

* Clippy fix

* Added new enum to TrackindId in order to track latency of packets cleared

* Fixes in comments

* Improved changelog entry, fixed description of wallet balance metric and updated telemetry output example in Hermes guide
  • Loading branch information
ljoss17 authored Jul 21, 2022
1 parent c10bf7c commit 53f3873
Show file tree
Hide file tree
Showing 7 changed files with 217 additions and 156 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- Improved naming and description of some telemetry metrics and added
histogram buckets for `tx_latency` metrics.
In addition the default value of the configuration `tx_confirmation`
in Hermes `config.toml` has been changed from `true` to `false`.
([#2408](https://github.com/informalsystems/ibc-rs/issues/2408))
7 changes: 4 additions & 3 deletions config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,10 @@ clear_on_start = true
# (to any chain in this config file) have been successfully delivered.
# If they have not been, and `clear_interval = 0`, then those packets are
# queued up for re-submission.
# Experimental feature. Affects telemetry if set to false.
# [Default: true]
tx_confirmation = true
# Experimental feature. If set to `false`, the following telemetry metrics will be disabled:
# `ibc_acknowledgment_packets`, `ibc_receive_packets` and `ibc_timeout_packets`.
# [Default: false]
tx_confirmation = false

# The REST section defines parameters for Hermes' built-in RESTful API.
# https://hermes.informal.systems/rest.html
Expand Down
210 changes: 105 additions & 105 deletions guide/src/telemetry.md

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions relayer/src/chain/tracking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ pub enum TrackingId {
/// corresponding event batch, eg. when performing actions from
/// the CLI or during packet clearing.
Static(&'static str),
/// Random identifier used to track latency of packet clearing.
ClearedUuid(Uuid),
}

impl TrackingId {
Expand All @@ -26,6 +28,10 @@ impl TrackingId {
pub fn new_static(s: &'static str) -> Self {
Self::Static(s)
}

pub fn new_cleared_uuid() -> Self {
Self::ClearedUuid(Uuid::new_v4())
}
}

impl fmt::Display for TrackingId {
Expand All @@ -37,6 +43,13 @@ impl fmt::Display for TrackingId {
s.fmt(f)
}
TrackingId::Static(s) => s.fmt(f),
TrackingId::ClearedUuid(u) => {
let mut uuid = "cleared/".to_owned();
let mut s = u.to_string();
s.truncate(8);
uuid.push_str(&s);
uuid.fmt(f)
}
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions relayer/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ pub mod default {
}

pub fn tx_confirmation() -> bool {
true
false
}

pub fn clear_packets_interval() -> u64 {
Expand Down Expand Up @@ -155,7 +155,7 @@ impl Default for ModeConfig {
enabled: true,
clear_interval: default::clear_packets_interval(),
clear_on_start: true,
tx_confirmation: true,
tx_confirmation: false,
},
}
}
Expand Down
55 changes: 30 additions & 25 deletions relayer/src/link/relay_path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,8 @@ impl<ChainA: ChainHandle, ChainB: ChainHandle> RelayPath<ChainA, ChainB> {
}

fn relay_pending_packets(&self, height: Option<Height>) -> Result<(), LinkError> {
let tracking_id = TrackingId::new_static("relay pending packets");
let tracking_id = TrackingId::new_cleared_uuid();
telemetry!(received_event_batch, tracking_id);

for i in 1..=MAX_RETRIES {
let cleared = self
Expand Down Expand Up @@ -1090,7 +1091,7 @@ impl<ChainA: ChainHandle, ChainB: ChainHandle> RelayPath<ChainA, ChainB> {
// Update telemetry info
telemetry!({
for e in events_chunk.clone() {
self.record_cleared_send_packet_and_acknowledgment(e);
self.record_cleared_send_packet(e);
}
});
self.events_to_operational_data(TrackedEvents::new(events_chunk, tracking_id))?;
Expand Down Expand Up @@ -1137,6 +1138,7 @@ impl<ChainA: ChainHandle, ChainB: ChainHandle> RelayPath<ChainA, ChainB> {
&self.path_id,
query_write_ack_events,
) {
telemetry!(self.record_cleared_acknowledgments(events_chunk.clone()));
self.events_to_operational_data(TrackedEvents::new(events_chunk, tracking_id))?;
}

Expand Down Expand Up @@ -1762,28 +1764,32 @@ impl<ChainA: ChainHandle, ChainB: ChainHandle> RelayPath<ChainA, ChainB> {
}

#[cfg(feature = "telemetry")]
fn record_cleared_send_packet_and_acknowledgment(&self, event: IbcEvent) {
match event {
IbcEvent::SendPacket(send_packet_ev) => {
ibc_telemetry::global().send_packet_count(
send_packet_ev.packet.sequence.into(),
send_packet_ev.height().revision_height(),
&self.src_chain().id(),
self.src_channel_id(),
self.src_port_id(),
&self.dst_chain().id(),
);
ibc_telemetry::global().cleared_count(
send_packet_ev.packet.sequence.into(),
send_packet_ev.height().revision_height(),
&self.src_chain().id(),
self.src_channel_id(),
self.src_port_id(),
&self.dst_chain().id(),
);
}
IbcEvent::WriteAcknowledgement(write_ack_ev) => {
ibc_telemetry::global().acknowledgement_count(
fn record_cleared_send_packet(&self, event: IbcEvent) {
if let IbcEvent::SendPacket(send_packet_ev) = event {
ibc_telemetry::global().send_packet_count(
send_packet_ev.packet.sequence.into(),
send_packet_ev.height().revision_height(),
&self.src_chain().id(),
self.src_channel_id(),
self.src_port_id(),
&self.dst_chain().id(),
);
ibc_telemetry::global().clear_send_packet_count(
send_packet_ev.packet.sequence.into(),
send_packet_ev.height().revision_height(),
&self.src_chain().id(),
self.src_channel_id(),
self.src_port_id(),
&self.dst_chain().id(),
);
}
}

#[cfg(feature = "telemetry")]
fn record_cleared_acknowledgments(&self, events: Vec<IbcEvent>) {
for e in events {
if let IbcEvent::WriteAcknowledgement(write_ack_ev) = e {
ibc_telemetry::global().clear_acknowledgment_packet_count(
write_ack_ev.packet.sequence.into(),
write_ack_ev.height().revision_height(),
&self.dst_chain().id(),
Expand All @@ -1792,7 +1798,6 @@ impl<ChainA: ChainHandle, ChainB: ChainHandle> RelayPath<ChainA, ChainB> {
&self.src_chain().id(),
);
}
_ => {}
}
}
}
79 changes: 58 additions & 21 deletions telemetry/src/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,11 @@ pub struct TelemetryState {
/// Counts the number of WriteAcknowledgement Hermes transfers.
acknowledgement_count: Counter<u64>,

/// Counts the number of SendPacket Hermes transfers from ClearPacket.
cleared_count: Counter<u64>,
/// Counts the number of SendPacket events Hermes processes from ClearPendingPackets.
cleared_send_packet_count: Counter<u64>,

/// Counts the number of WriteAcknowledgment events Hermes processes from ClearPendingPackets.
cleared_acknowledgment_count: Counter<u64>,

/// Records the sequence number of the oldest SendPacket for which no
/// WriteAcknowledgement has been received. The value is 0 if all the
Expand Down Expand Up @@ -162,7 +165,9 @@ impl TelemetryState {
KeyValue::new("src_port", src_port.to_string()),
];

self.receive_packets.add(count, labels);
if count > 0 {
self.receive_packets.add(count, labels);
}
}

/// Number of acknowledgment packets relayed, per channel
Expand All @@ -179,7 +184,9 @@ impl TelemetryState {
KeyValue::new("src_port", src_port.to_string()),
];

self.acknowledgment_packets.add(count, labels);
if count > 0 {
self.acknowledgment_packets.add(count, labels);
}
}

/// Number of timeout packets relayed, per channel
Expand All @@ -196,7 +203,9 @@ impl TelemetryState {
KeyValue::new("src_port", src_port.to_string()),
];

self.timeout_packets.add(count, labels);
if count > 0 {
self.timeout_packets.add(count, labels);
}
}

/// Number of queries emitted by the relayer, per chain and query type
Expand Down Expand Up @@ -351,7 +360,7 @@ impl TelemetryState {
self.acknowledgement_count.add(1, labels);
}

pub fn cleared_count(
pub fn clear_send_packet_count(
&self,
_seq_nr: u64,
_height: u64,
Expand All @@ -367,7 +376,26 @@ impl TelemetryState {
KeyValue::new("port", port_id.to_string()),
];

self.cleared_count.add(1, labels);
self.cleared_send_packet_count.add(1, labels);
}

pub fn clear_acknowledgment_packet_count(
&self,
_seq_nr: u64,
_height: u64,
chain_id: &ChainId,
channel_id: &ChannelId,
port_id: &PortId,
counterparty_chain_id: &ChainId,
) {
let labels: &[KeyValue; 4] = &[
KeyValue::new("chain", chain_id.to_string()),
KeyValue::new("counterparty", counterparty_chain_id.to_string()),
KeyValue::new("channel", channel_id.to_string()),
KeyValue::new("port", port_id.to_string()),
];

self.cleared_acknowledgment_count.add(1, labels);
}

pub fn record_send_history(
Expand Down Expand Up @@ -493,9 +521,14 @@ impl AggregatorSelector for CustomAggregatorSelector {
// Prometheus' supports only collector for histogram, sum, and last value aggregators.
// https://docs.rs/opentelemetry-prometheus/0.10.0/src/opentelemetry_prometheus/lib.rs.html#411-418
// TODO: Once quantile sketches are supported, replace histograms with that.
// For the moment, disable histogram buckets since no values make sense for all use-cases.
"tx_latency_submitted" => Some(Arc::new(histogram(descriptor, &[]))),
"tx_latency_confirmed" => Some(Arc::new(histogram(descriptor, &[]))),
"tx_latency_submitted" => Some(Arc::new(histogram(
descriptor,
&[200.0, 500.0, 1000.0, 2000.0, 5000.0, 10000.0],
))),
"tx_latency_confirmed" => Some(Arc::new(histogram(
descriptor,
&[1000.0, 5000.0, 9000.0, 13000.0, 17000.0, 20000.0],
))),
_ => Some(Arc::new(sum())),
}
}
Expand Down Expand Up @@ -529,17 +562,17 @@ impl Default for TelemetryState {

receive_packets: meter
.u64_counter("ibc_receive_packets")
.with_description("Number of receive packets relayed per channel")
.with_description("Number of confirmed receive packets relayed per channel. Available if relayer runs with Tx confirmation enabled")
.init(),

acknowledgment_packets: meter
.u64_counter("ibc_acknowledgment_packets")
.with_description("Number of acknowledgment packets relayed per channel")
.with_description("Number of confirmed acknowledgment packets relayed per channel. Available if relayer runs with Tx confirmation enabled")
.init(),

timeout_packets: meter
.u64_counter("ibc_timeout_packets")
.with_description("Number of timeout packets relayed per channel")
.with_description("Number of confirmed timeout packets relayed per channel. Available if relayer runs with Tx confirmation enabled")
.init(),

queries: meter
Expand Down Expand Up @@ -571,23 +604,27 @@ impl Default for TelemetryState {

wallet_balance: meter
.f64_value_recorder("wallet_balance")
.with_description("The balance of each wallet Hermes uses per chain. Please note that when converting the balance to f64 a loss in
precision might be introduced in the displayed value")
.with_description("The balance of each wallet Hermes uses per chain. Please note that when converting the balance to f64 a loss in precision might be introduced in the displayed value")
.init(),

send_packet_count: meter
.u64_counter("send_packet_count")
.with_description("Number of SendPacket relayed")
.with_description("Number of SendPacket events processed")
.init(),

acknowledgement_count: meter
.u64_counter("acknowledgement_count")
.with_description("Number of WriteAcknowledgement relayed")
.with_description("Number of WriteAcknowledgement events processed")
.init(),

cleared_send_packet_count: meter
.u64_counter("cleared_send_packet_count")
.with_description("Number of SendPacket events processed during the initial and periodic clearing")
.init(),

cleared_count: meter
.u64_counter("cleared_count")
.with_description("Number of SendPacket relayed through ClearPendingPackets")
cleared_acknowledgment_count: meter
.u64_counter("cleared_acknowledgment_count")
.with_description("Number of WriteAcknowledgment events processed during the initial and periodic clearing")
.init(),

tx_latency_submitted: meter
Expand Down Expand Up @@ -615,7 +652,7 @@ impl Default for TelemetryState {

oldest_sequence: meter
.u64_value_recorder("oldest_sequence")
.with_description("The sequence number of the oldest pending SendPacket. If this value is 0, it means there are no pending SendPacket")
.with_description("The sequence number of the oldest SendPacket event observed without its corresponding WriteAcknowledgement event. If this value is 0, it means Hermes observed a WriteAcknowledgment event for all the SendPacket events")
.init(),

oldest_timestamp: meter
Expand Down

0 comments on commit 53f3873

Please sign in to comment.