Skip to content

Commit

Permalink
Merge pull request #3120 from esl/host-type-metrics
Browse files Browse the repository at this point in the history
Metrics per host type
  • Loading branch information
chrzaszcz authored May 31, 2021
2 parents 2a0d26a + 25f0555 commit cf515ef
Show file tree
Hide file tree
Showing 13 changed files with 361 additions and 323 deletions.
38 changes: 20 additions & 18 deletions big_tests/tests/metrics_api_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -249,11 +249,11 @@ metrics_only_global(_Config) ->
Opts = string:split(V, ", ", all),
?assertEqual([<<"GET">>,<<"HEAD">>,<<"OPTIONS">>], lists:sort(Opts)),

% List of hosts and metrics
% List of host types and metrics
Res2 = simple_request(<<"GET">>, "/metrics/", Port),
{_S2, _H2, B2} = Res2,
assert_status(200, Res2),
#{<<"hosts">> := [_ExampleHost | _],
#{<<"host_types">> := [_ExampleHostType | _],
<<"metrics">> := [],
<<"global">> := [ExampleGlobal | _]} = B2,

Expand Down Expand Up @@ -282,11 +282,11 @@ metrics_msg_flow(_Config) ->
Opts = string:split(V, ", ", all),
?assertEqual([<<"GET">>,<<"HEAD">>,<<"OPTIONS">>], lists:sort(Opts)),

% List of hosts and metrics
% List of host types and metrics
Res2 = simple_request(<<"GET">>, "/metrics/", ?PORT),
{_S2, _H2, B2} = Res2,
assert_status(200, Res2),
#{<<"hosts">> := [ExampleHost | _],
#{<<"host_types">> := [ExampleHostType | _],
<<"metrics">> := [ExampleMetric | _],
<<"global">> := [ExampleGlobal | _]} = B2,

Expand All @@ -310,31 +310,31 @@ metrics_msg_flow(_Config) ->
Res5 = simple_request(<<"GET">>, "/metrics/all/nonExistentMetric", ?PORT),
assert_status(404, Res5),

% All metrics for an example host
% All metrics for an example host type
Res6 = simple_request(<<"GET">>,
unicode:characters_to_list(["/metrics/host/", ExampleHost]),
unicode:characters_to_list(["/metrics/host_type/", ExampleHostType]),
?PORT),
{_S6, _H6, B6} = Res6,
#{<<"metrics">> := _} = B6,
?assertEqual(1, maps:size(B6)),

% Negative case for a non-existent host
Res7 = simple_request(<<"GET">>, "/metrics/host/nonExistentHost", ?PORT),
% Negative case for a non-existent host type
Res7 = simple_request(<<"GET">>, "/metrics/host_type/nonExistentHostType", ?PORT),
assert_status(404, Res7),

% An example metric for an example host
% An example metric for an example host type
Res8 = simple_request(<<"GET">>,
unicode:characters_to_list(["/metrics/host/", ExampleHost,
unicode:characters_to_list(["/metrics/host_type/", ExampleHostType,
"/", ExampleMetric]),
?PORT),
{_S8, _H8, B8} = Res8,
#{<<"metric">> := #{<<"one">> := _, <<"count">> := _} = IM2} = B8,
?assertEqual(2, maps:size(IM2)),
?assertEqual(1, maps:size(B8)),

% Negative case for a non-existent (host, metric) pair
% Negative case for a non-existent (host type, metric) pair
Res9 = simple_request(<<"GET">>,
unicode:characters_to_list(["/metrics/host/", ExampleHost,
unicode:characters_to_list(["/metrics/host_type/", ExampleHostType,
"/nonExistentMetric"]),
?PORT),
assert_status(404, Res9),
Expand Down Expand Up @@ -384,21 +384,23 @@ find(CounterName, CounterList) ->

fetch_counter_value(Counter, _Config) ->
Metric = atom_to_binary(Counter, utf8),
Host = ct:get_config({hosts, mim, domain}),

HostType = domain_helper:host_type(mim),
HostTypeName = metrics_helper:make_host_type_name(HostType),

Result = simple_request(<<"GET">>,
unicode:characters_to_list(["/metrics/host/", Host, "/", Metric]),
unicode:characters_to_list(["/metrics/host_type/", HostTypeName, "/", Metric]),
?PORT),
{_S, _H, B} = Result,
assert_status(200, Result),
#{<<"metric">> := #{<<"count">> := HostValue}} = B,
#{<<"metric">> := #{<<"count">> := HostTypeValue}} = B,

Result2 = simple_request(<<"GET">>,
unicode:characters_to_list(["/metrics/host/", Host]),
unicode:characters_to_list(["/metrics/host_type/", HostTypeName]),
?PORT),
{_S2, _H2, B2} = Result2,
assert_status(200, Result2),
#{<<"metrics">> := #{Metric := #{<<"count">> := HostValueList}}} = B2,
#{<<"metrics">> := #{Metric := #{<<"count">> := HostTypeValueList}}} = B2,

Result3 = simple_request(<<"GET">>,
unicode:characters_to_list(["/metrics/all/", Metric]),
Expand All @@ -412,7 +414,7 @@ fetch_counter_value(Counter, _Config) ->
assert_status(200, Result4),
#{<<"metrics">> := #{Metric := #{<<"count">> := TotalValueList}}} = B4,

[HostValue, HostValueList, TotalValue, TotalValueList].
[HostTypeValue, HostTypeValueList, TotalValue, TotalValueList].

%% @doc Fetch counter that is static.
fetch_global_gauge_value(Counter, Config) ->
Expand Down
27 changes: 17 additions & 10 deletions big_tests/tests/metrics_helper.erl
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@
-define(ONLY_GLOBAL_METRICS_GROUP_USERS, [clusterguy, clusterbuddy]).

get_counter_value(CounterName) ->
get_counter_value(ct:get_config({hosts, mim, domain}), CounterName).
get_counter_value(domain_helper:host_type(mim), CounterName).

get_counter_value(Host, Metric) ->
case rpc(mim(), mongoose_metrics, get_metric_value, [Host, Metric]) of
get_counter_value(HostType, Metric) ->
HostTypeName = make_host_type_name(HostType),
case rpc(mim(), mongoose_metrics, get_metric_value, [HostTypeName, Metric]) of
{ok, [{count, Total}, {one, _}]} ->
{value, Total};
{ok, [{value, Value} | _]} when is_integer(Value) ->
Expand All @@ -27,10 +28,11 @@ get_counter_value(Host, Metric) ->
end.

assert_counter(Value, CounterName) ->
assert_counter(ct:get_config({hosts, mim, domain}), Value, CounterName).
assert_counter(domain_helper:host_type(mim), Value, CounterName).

assert_counter(Host, Value, CounterName) ->
{value, Value} = get_counter_value(Host, CounterName).
assert_counter(HostType, Value, CounterName) ->
HostTypeName = make_host_type_name(HostType),
{value, Value} = get_counter_value(HostTypeName, CounterName).

-spec prepare_by_all_metrics_are_global(Config :: list(), UseAllMetricsAreGlobal :: boolean()) ->
list().
Expand Down Expand Up @@ -88,11 +90,16 @@ user_ids(Config) ->
end.

wait_for_counter(ExpectedValue, Counter) ->
wait_for_counter(ct:get_config({hosts, mim, domain}), ExpectedValue, Counter).
wait_for_counter(domain_helper:host_type(mim), ExpectedValue, Counter).

wait_for_counter(Host, ExpectedValue, Counter) ->
mongoose_helper:wait_until(fun() ->
assert_counter(Host, ExpectedValue, Counter)
wait_for_counter(HostType, ExpectedValue, Counter) ->
mongoose_helper:wait_until(fun() ->
assert_counter(HostType, ExpectedValue, Counter)
end,
{value, ExpectedValue},
#{name => Counter, time_left => ?WAIT_TIME, sleep_time => 20}).

%% @doc Convert a host type into the form used when addressing its metrics.
%% Binary host types get every space replaced with an underscore;
%% atoms are returned unchanged.
make_host_type_name(HostType) when is_binary(HostType) ->
    binary:replace(HostType, <<" ">>, <<"_">>, [global]);
make_host_type_name(HostType) when is_atom(HostType) ->
    HostType.
11 changes: 10 additions & 1 deletion doc/migrations/4.2.0_4.3.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,15 @@ GO

## Groupchat hook migrations

- `filter_room_packet` hook uses a map insted of a proplist
- `filter_room_packet` hook uses a map instead of a proplist
for the event data information.
- `room_send_packet` hook has been removed. Use `filter_room_packet` instead.

## Metrics REST API (obsolete)

The API is still considered obsolete so if you are using it,
please consider using [WombatOAM](https://www.erlang-solutions.com/capabilities/wombatoam/)
or metrics reporters as described in [Logging and monitoring](../operation-and-maintenance/Logging-&-monitoring.md).

In each endpoint, `host` has been changed to `host_type`.
This is because the metrics are now collected per host type rather than host.
120 changes: 60 additions & 60 deletions doc/operation-and-maintenance/MongooseIM-metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ MongooseIM uses [ESL's fork of this project](https://github.com/esl/exometer/tre

All metrics are divided into the following groups:

* Per host metrics: Gathered separately for every XMPP host supported by the cluster.
**Warning:** If a cluster supports many (thousands or more) domains, performance issues might occur.
* Per host type metrics: Gathered separately for every host type supported by the cluster.
**Warning:** If a cluster supports many (thousands or more) host types, performance issues might occur.
To avoid this, use global equivalents of the metrics with `all_metrics_are_global` config option.
* Hook metrics.
They are created for every [hook](../developers-guide/Hooks-and-handlers.md) and incremented on every call to it.
* Global metrics: Metrics common for all XMPP hosts.
* Global metrics: Metrics common for all host types.
* Data metrics.
These are misc. metrics related to data transfers (e.g. sent and received stanza size statistics).
* VM metrics. Basic Erlang VM statistics.
Expand Down Expand Up @@ -60,7 +60,7 @@ A histogram collects values over a sliding window of 60s and exposes the followi
* `median`
* `50`, `75`, `90`, `95`, `99`, `999` - 50th, 75th, 90th, 95th, 99th and 99.9th percentile

## Per host metrics
## Per host type metrics

### Hook metrics

Expand All @@ -69,72 +69,72 @@ As a result it makes more sense to maintain a list of the most relevant or usefu

| Name | Type | Description (when it gets incremented) |
| ---- | ---- | -------------------------------------- |
| `[Host, anonymous_purge_hook]` | spiral | An anonymous user disconnects. |
| `[Host, c2s_unauthenticated_iq]` | spiral | An IQ sent from a user to a server without authentication. |
| `[Host, disco_info]` | spiral | An information about the server has been requested via Disco protocol. |
| `[Host, disco_local_features]` | spiral | A list of server features is gathered. |
| `[Host, disco_local_identity]` | spiral | A list of server identities is gathered. |
| `[Host, disco_local_items]` | spiral | A list of server's items (e.g. services) is gathered. |
| `[Host, disco_sm_features]` | spiral | A list of user's features is gathered. |
| `[Host, disco_sm_identity]` | spiral | A list of user's identities is gathered. |
| `[Host, disco_sm_items]` | spiral | A list of user's items is gathered. |
| `[Host, mam_lookup_messages]` | spiral | An archive lookup is performed. |
| `[Host, offline_message_hook]` | spiral | A message was sent to an offline user. (Except for "error", "headline" and "groupchat" message types.) |
| `[Host, offline_groupchat_message_hook]` | spiral | A groupchat message was sent to an offline user. |
| `[Host, privacy_updated_list]` | spiral | User's privacy list is updated. |
| `[Host, resend_offline_messages_hook]` | spiral | A list of offline messages is gathered for delivery to a user's new connection. |
| `[Host, roster_get_subscription_lists]` | spiral | Presence subscription lists (based on which presence updates are broadcasted) are gathered. |
| `[Host, roster_in_subscription]` | spiral | A presence with subscription update is processed. |
| `[Host, roster_out_subscription]` | spiral | A presence with subscription update is received from a client. |
| `[Host, sm_broadcast]` | spiral | A stanza is broadcasted to all of user's resources. |
| `[Host, unset_presence_hook]` | spiral | A user disconnects or sends an `unavailable` presence. |
| `[HostType, anonymous_purge_hook]` | spiral | An anonymous user disconnects. |
| `[HostType, c2s_unauthenticated_iq]` | spiral | An IQ sent from a user to a server without authentication. |
| `[HostType, disco_info]` | spiral | Information about the server has been requested via the Disco protocol. |
| `[HostType, disco_local_features]` | spiral | A list of server features is gathered. |
| `[HostType, disco_local_identity]` | spiral | A list of server identities is gathered. |
| `[HostType, disco_local_items]` | spiral | A list of server's items (e.g. services) is gathered. |
| `[HostType, disco_sm_features]` | spiral | A list of user's features is gathered. |
| `[HostType, disco_sm_identity]` | spiral | A list of user's identities is gathered. |
| `[HostType, disco_sm_items]` | spiral | A list of user's items is gathered. |
| `[HostType, mam_lookup_messages]` | spiral | An archive lookup is performed. |
| `[HostType, offline_message_hook]` | spiral | A message was sent to an offline user. (Except for "error", "headline" and "groupchat" message types.) |
| `[HostType, offline_groupchat_message_hook]` | spiral | A groupchat message was sent to an offline user. |
| `[HostType, privacy_updated_list]` | spiral | User's privacy list is updated. |
| `[HostType, resend_offline_messages_hook]` | spiral | A list of offline messages is gathered for delivery to a user's new connection. |
| `[HostType, roster_get_subscription_lists]` | spiral | Presence subscription lists (based on which presence updates are broadcast) are gathered. |
| `[HostType, roster_in_subscription]` | spiral | A presence with subscription update is processed. |
| `[HostType, roster_out_subscription]` | spiral | A presence with subscription update is received from a client. |
| `[HostType, sm_broadcast]` | spiral | A stanza is broadcast to all of user's resources. |
| `[HostType, unset_presence_hook]` | spiral | A user disconnects or sends an `unavailable` presence. |

### Presences & rosters

| Name | Type | Description (when it gets incremented) |
| ---- | ---- | -------------------------------------- |
| `[Host, modPresenceSubscriptions]` | spiral | Presence subscription is processed. |
| `[Host, modPresenceUnsubscriptions]` | spiral | Presence unsubscription is processed. |
| `[Host, modRosterGets]` | spiral | User's roster is fetched. |
| `[Host, modRosterPush]` | spiral | A roster update is pushed to a single session. |
| `[Host, modRosterSets]` | spiral | User's roster is updated. |
| `[HostType, modPresenceSubscriptions]` | spiral | Presence subscription is processed. |
| `[HostType, modPresenceUnsubscriptions]` | spiral | Presence unsubscription is processed. |
| `[HostType, modRosterGets]` | spiral | User's roster is fetched. |
| `[HostType, modRosterPush]` | spiral | A roster update is pushed to a single session. |
| `[HostType, modRosterSets]` | spiral | User's roster is updated. |

### Privacy lists

| Name | Type | Description (when it gets incremented) |
| ---- | ---- | -------------------------------------- |
| `[Host, modPrivacyGets]` | spiral | IQ privacy `get` is processed. |
| `[Host, modPrivacyPush]` | spiral | Privacy list update is sent to a single session. |
| `[Host, modPrivacySets]` | spiral | IQ privacy `set` is processed. |
| `[Host, modPrivacySetsActive]` | spiral | Active privacy list is changed. |
| `[Host, modPrivacySetsDefault]` | spiral | Default privacy list is changed. |
| `[Host, modPrivacyStanzaAll]` | spiral | A packet is checked against the privacy list. |
| `[Host, modPrivacyStanzaDenied]` | spiral | Privacy list check resulted in `deny`. |
| `[Host, modPrivacyStanzaBlocked]` | spiral | Privacy list check resulted in `block`. |
| `[HostType, modPrivacyGets]` | spiral | IQ privacy `get` is processed. |
| `[HostType, modPrivacyPush]` | spiral | Privacy list update is sent to a single session. |
| `[HostType, modPrivacySets]` | spiral | IQ privacy `set` is processed. |
| `[HostType, modPrivacySetsActive]` | spiral | Active privacy list is changed. |
| `[HostType, modPrivacySetsDefault]` | spiral | Default privacy list is changed. |
| `[HostType, modPrivacyStanzaAll]` | spiral | A packet is checked against the privacy list. |
| `[HostType, modPrivacyStanzaDenied]` | spiral | Privacy list check resulted in `deny`. |
| `[HostType, modPrivacyStanzaBlocked]` | spiral | Privacy list check resulted in `block`. |

### Other

| Name | Type | Description (when it gets incremented) |
| ---- | ---- | -------------------------------------- |
| `[Host, sessionAuthFails]` | spiral | A client failed to authenticate. |
| `[Host, sessionCount]` | counter | Number of active sessions. |
| `[Host, sessionLogouts]` | spiral | A client session is closed. |
| `[Host, sessionSuccessfulLogins]` | spiral | A client session is opened. |
| `[Host, xmppErrorIq]` | spiral | An `error` IQ is sent to a client. |
| `[Host, xmppErrorMessage]` | spiral | An `error` message is sent to a client. |
| `[Host, xmppErrorPresence]` | spiral | An `error` presence is sent to a client. |
| `[Host, xmppErrorTotal]` | spiral | A stanza with `error` type is routed. |
| `[Host, xmppMessageBounced]` | spiral | A `service-unavailable` error is sent, because the message recipient if offline. |
| `[Host, xmppIqSent]` | spiral | An IQ is sent by a client. |
| `[Host, xmppMessageSent]` | spiral | A message is sent by a client |
| `[Host, xmppPresenceSent]` | spiral | A presence is sent by a client. |
| `[Host, xmppStanzaSent]` | spiral | A stanza is sent by a client. |
| `[Host, xmppIqReceived]` | spiral | An IQ is sent to a client. |
| `[Host, xmppMessageReceived]` | spiral | A message is sent to a client. |
| `[Host, xmppPresenceReceived]` | spiral | A presence is sent to a client. |
| `[Host, xmppStanzaReceived]` | spiral | A stanza is sent to a client. |
| `[Host, xmppStanzaCount]` | spiral | A stanza is sent to a client. |
| `[Host, xmppStanzaDropped]` | spiral | A stanza is dropped due to an AMP rule or a `filter_packet` processing flow. |
| `[HostType, sessionAuthFails]` | spiral | A client failed to authenticate. |
| `[HostType, sessionCount]` | counter | Number of active sessions. |
| `[HostType, sessionLogouts]` | spiral | A client session is closed. |
| `[HostType, sessionSuccessfulLogins]` | spiral | A client session is opened. |
| `[HostType, xmppErrorIq]` | spiral | An `error` IQ is sent to a client. |
| `[HostType, xmppErrorMessage]` | spiral | An `error` message is sent to a client. |
| `[HostType, xmppErrorPresence]` | spiral | An `error` presence is sent to a client. |
| `[HostType, xmppErrorTotal]` | spiral | A stanza with `error` type is routed. |
| `[HostType, xmppMessageBounced]` | spiral | A `service-unavailable` error is sent, because the message recipient is offline. |
| `[HostType, xmppIqSent]` | spiral | An IQ is sent by a client. |
| `[HostType, xmppMessageSent]` | spiral | A message is sent by a client. |
| `[HostType, xmppPresenceSent]` | spiral | A presence is sent by a client. |
| `[HostType, xmppStanzaSent]` | spiral | A stanza is sent by a client. |
| `[HostType, xmppIqReceived]` | spiral | An IQ is sent to a client. |
| `[HostType, xmppMessageReceived]` | spiral | A message is sent to a client. |
| `[HostType, xmppPresenceReceived]` | spiral | A presence is sent to a client. |
| `[HostType, xmppStanzaReceived]` | spiral | A stanza is sent to a client. |
| `[HostType, xmppStanzaCount]` | spiral | A stanza is sent to a client. |
| `[HostType, xmppStanzaDropped]` | spiral | A stanza is dropped due to an AMP rule or a `filter_packet` processing flow. |

### Extension-specific metrics

Expand Down Expand Up @@ -185,16 +185,16 @@ The latter is a number of calls (spiral metric), incremented for *every* call (e

Besides these, the following authentication metrics are always available:

* `[Host, backends, auth, authorize]`
* `[Host, backends, auth, check_password]`
* `[Host, backends, auth, try_register]`
* `[Host, backends, auth, does_user_exist]`
* `[HostType, backends, auth, authorize]`
* `[HostType, backends, auth, check_password]`
* `[HostType, backends, auth, try_register]`
* `[HostType, backends, auth, does_user_exist]`

These are **total** times of respective operations.
One operation usually requires only a single call to an auth backend, but sometimes, e.g. with 3 backends configured, the operation may fail for the first 2 backends.
In such a case, these metrics will be updated with the combined time of 2 failed and 1 successful request.

Additionally, the RDBMS layer in MongooseIM exposes two more metrics, if RDBMS is configured:

* `[global, backends, mongoose_rdbms, query]` - Execution time of a "simple"" (not prepared) query by a DB driver.
* `[global, backends, mongoose_rdbms, query]` - Execution time of a "simple" (not prepared) query by a DB driver.
* `[global, backends, mongoose_rdbms, execute]` - Execution time of a prepared query by a DB driver.
Loading

0 comments on commit cf515ef

Please sign in to comment.