From 5b145ab605338e9e302e5e944feb20e76c6ee1c0 Mon Sep 17 00:00:00 2001 From: Viacheslav Katsuba Date: Thu, 10 Dec 2020 15:19:15 +0200 Subject: [PATCH] Improve session termination reason handling (#271) * Improve session termination reason handling * rework the termination cause handling * Update METRICS.md * Update ergw_aaa to 3.6.4 Co-authored-by: Andreas Schultz --- METRICS.md | 26 ++++++++++++++++++-------- README.md | 10 ++++++++-- rebar.config | 2 +- rebar.lock | 4 ++-- src/ergw_gtp_gsn_lib.erl | 14 +++++++++----- src/ergw_prometheus.erl | 4 ++-- src/ggsn_gn.erl | 9 +++++---- src/gtp_context.erl | 33 +++++++++++++++++++++------------ src/pgw_s5s8.erl | 9 +++++---- src/saegw_s11.erl | 9 +++++---- src/tdf.erl | 20 ++++++++++++++------ test/pgw_SUITE.erl | 4 ++++ 12 files changed, 94 insertions(+), 50 deletions(-) diff --git a/METRICS.md b/METRICS.md index 7af62cc2..1cccd6f6 100644 --- a/METRICS.md +++ b/METRICS.md @@ -26,7 +26,7 @@ The following metrics exist: | gtp\_u\_socket\_messages\_processed\_total | counter | name, direction, version, type | Total number of GTP message processed on socket | | ergw\_local\_pool\_free | gauge | name, type, id | Number of free IPs | | ergw\_local\_pool\_used | gauge | name, type, id | Number of used IPs | -| termination\_cause\_total | counter | name, type | Total number of termination causes | +| termination\_cause\_total | counter | api, reason | Total number of termination causes | The label `name` is is taken from the configuration of the GTP socket and PeerIP is the IP address of the peer GSN. @@ -192,13 +192,23 @@ For GTPv2-C messages the following types exist: * version\_not\_supported The label `type` is the Termination Causes types. 
For Termination causes the following types exist: - * normal - * administrative - * link_broken - * upf_failure - * remote_failure - * inactivity_timeout - * peer_restart +| Reason | Description | +| --------------------- | ---------------- | +| normal | Normal session termination | +| administrative | The session is terminated by an administrative action (e.g. draining) | +| link_broken | A session message initiated by the ERGW to the SGW/SGSN is rejected | +| upf_failure | Communication between the ERGW and the UPF failed | +| remote_failure | In a proxy setup, communication to the remote PGW failed | +| peer_restart | The session is terminated because a remote peer restart was detected on one of the GTP peer connections the session is associated with | +| cp_inactivity_timeout | The session had no Control Plane (GTP-C) activity within the configured session inactivity timeout | +| up_inactivity_timeout | User plane has reported that the session had no user data transfer within the configured session inactivity timeout | +| 'ASR' | One of the `AAA` interfaces (Gx, Gy, Ro) has disconnected the session | +| error | An unidentified error has been returned for an AAA request | +| req_timeout | An `AAA` request related to the session has timed out. :warning: **Note** : that this is only triggered when the `AAA` handler has no configured default answer for this error | +| conn_error | An `AAA` request related to the session failed because of no connection available. :warning: **Note** : that this is only triggered when the `AAA` handler has no configured default answer for this error | +| rate_limit | An `AAA` request related to the session failed because of rate limit reached towards the `AAA` interface instance. 
:warning: **Note** : that this is only triggered when the `AAA` handler has no configured default answer for this error | +| ocs_hold_end | The session was terminated because the OCS Hold duration given by the `AAA` Gy interface has expired | +| peer_reject | An `AAA` peer (e.g. OCS, PCRF) has sent an error result in the response to a request | The HTTP API exports the metrics in Prometheus format at `/metrics`: diff --git a/README.md b/README.md index 350285be..94146b00 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,10 @@ Example of configuration **RADIUS**: {link_broken, 2}, {upf_failure, 9}, {remote_failure, 9}, - {inactivity_timeout, 4}, + {cp_inactivity_timeout, 4}, + {up_inactivity_timeout, 4}, + {'ASR', 6}, + {error, 9}, {peer_restart, 7} ]} ]} @@ -167,7 +170,10 @@ Example of configuration **ergw-pgw-epc-rf** `function` of **DIAMETER**: {link_broken, 5}, {upf_failure, 5}, {remote_failure, 1}, - {inactivity_timeout, 1}, + {cp_inactivity_timeout, 4}, + {up_inactivity_timeout, 4}, + {'ASR', 6}, + {error, 9}, {peer_restart, 1} ]} %% ... diff --git a/rebar.config b/rebar.config index 315af1a7..d06fc72c 100644 --- a/rebar.config +++ b/rebar.config @@ -13,7 +13,7 @@ {netdata, {git, "https://github.com/RoadRunnr/erl_netdata.git", {ref, "cbd6eaf"}}}, {gtplib, {git, "https://github.com/travelping/gtplib.git", {branch, "master"}}}, {pfcplib, {git, "https://github.com/travelping/pfcplib.git", {branch, "master"}}}, - {ergw_aaa, {git, "git://github.com/travelping/ergw_aaa", {tag, "3.6.2"}}}, + {ergw_aaa, {git, "https://github.com/travelping/ergw_aaa.git", {tag, "3.6.4"}}}, {prometheus_diameter_collector, {git, "https://github.com/travelping/prometheus_diameter_collector.git", {tag, "1.0.1"}}} ]}. 
diff --git a/rebar.lock b/rebar.lock index c73c0fbd..e21a1fc8 100644 --- a/rebar.lock +++ b/rebar.lock @@ -7,8 +7,8 @@ {ref,"7147d879177f3a9ad88f909a12e41e1c565269b0"}}, 1}, {<<"ergw_aaa">>, - {git,"git://github.com/travelping/ergw_aaa", - {ref,"4a55ea63a783ec25dc9a6fed3d9a32f367c6697c"}}, + {git,"https://github.com/travelping/ergw_aaa.git", + {ref,"64f64872aadb185e54e637e6b9ba2bbdc0d2f6df"}}, 0}, {<<"erlando">>, {git,"https://github.com/travelping/erlando.git", diff --git a/src/ergw_gtp_gsn_lib.erl b/src/ergw_gtp_gsn_lib.erl index 2b113541..8254506e 100644 --- a/src/ergw_gtp_gsn_lib.erl +++ b/src/ergw_gtp_gsn_lib.erl @@ -11,7 +11,7 @@ {parse_transform, cut}]). -export([connect_upf_candidates/4, create_session/10]). --export([triggered_charging_event/4, usage_report/3, close_context/2]). +-export([triggered_charging_event/4, usage_report/3, close_context/3]). -export([update_tunnel_endpoint/3, handle_peer_change/3, update_tunnel_endpoint/2, apply_bearer_change/5]). @@ -224,10 +224,14 @@ triggered_charging_event(ChargeEv, Now, Request, usage_report(URRActions, UsageReport, #{pfcp := PCtx, 'Session' := Session}) -> ergw_gtp_gsn_session:usage_report(URRActions, UsageReport, PCtx, Session). -close_context(Reason, #{pfcp := PCtx, 'Session' := Session}) -> - UsageReport = ergw_pfcp_context:delete_session(Reason, PCtx), - ergw_gtp_gsn_session:close_context(Reason, UsageReport, PCtx, Session), - ergw_prometheus:termination_cause(?FUNCTION_NAME, Reason), +%% close_context/3 +close_context(_, {API, TermCause}, Context) -> + close_context(API, TermCause, Context); +close_context(API, TermCause, #{pfcp := PCtx, 'Session' := Session}) + when is_atom(TermCause) -> + UsageReport = ergw_pfcp_context:delete_session(TermCause, PCtx), + ergw_gtp_gsn_session:close_context(TermCause, UsageReport, PCtx, Session), + ergw_prometheus:termination_cause(API, TermCause), ok. 
%%==================================================================== diff --git a/src/ergw_prometheus.erl b/src/ergw_prometheus.erl index 214acdd7..06052657 100644 --- a/src/ergw_prometheus.erl +++ b/src/ergw_prometheus.erl @@ -110,8 +110,8 @@ declare() -> %% Termination cause metrics prometheus_counter:declare([{name, termination_cause_total}, - {labels, [name, type]}, - {help, "Total number of termination causes"}]), + {labels, [api, reason]}, + {help, "Total number of termination causes"}]), ok. %%%=================================================================== diff --git a/src/ggsn_gn.erl b/src/ggsn_gn.erl index a262a7a3..d68f9d99 100644 --- a/src/ggsn_gn.erl +++ b/src/ggsn_gn.erl @@ -41,6 +41,7 @@ -import(ergw_aaa_session, [to_session/1]). +-define(API, 'gn/gp'). -define(T3, 10 * 1000). -define(N3, 5). @@ -237,7 +238,7 @@ handle_request(ReqKey, handle_request(ReqKey, #gtp{type = delete_pdp_context_request, ie = _IEs} = Request, _Resent, _State, #{left_tunnel := LeftTunnel} = Data) -> - ergw_gtp_gsn_lib:close_context(normal, Data), + ergw_gtp_gsn_lib:close_context(?API, normal, Data), Response = response(delete_pdp_context_response, LeftTunnel, request_accepted), gtp_context:send_response(ReqKey, Request, Response), {next_state, shutdown, Data}; @@ -248,7 +249,7 @@ handle_request(ReqKey, _Msg, _Resent, _State, _Data) -> handle_response({From, TermCause}, timeout, #gtp{type = delete_pdp_context_request}, _State, Data) -> - ergw_gtp_gsn_lib:close_context(TermCause, Data), + ergw_gtp_gsn_lib:close_context(?API, TermCause, Data), if is_tuple(From) -> gen_statem:reply(From, {error, timeout}); true -> ok end, @@ -262,7 +263,7 @@ handle_response({From, TermCause}, LeftTunnel = gtp_path:bind(Response, LeftTunnel0), DataNew = Data#{left_tunnel := LeftTunnel}, - ergw_gtp_gsn_lib:close_context(TermCause, Data), + ergw_gtp_gsn_lib:close_context(?API, TermCause, Data), if is_tuple(From) -> gen_statem:reply(From, {ok, Cause}); true -> ok end, @@ -343,7 +344,7 @@ 
encode_eua(Org, Number, IPv4, IPv6) -> pdp_address = <>}. close_context(_Side, Reason, _State, Data) -> - ergw_gtp_gsn_lib:close_context(Reason, Data). + ergw_gtp_gsn_lib:close_context(?API, Reason, Data). map_attr('APN', #{?'Access Point Name' := #access_point_name{apn = APN}}) -> unicode:characters_to_binary(lists:join($., APN)); diff --git a/src/gtp_context.erl b/src/gtp_context.erl index 31ae898e..6d0821a6 100644 --- a/src/gtp_context.erl +++ b/src/gtp_context.erl @@ -115,11 +115,13 @@ tunnel_reg_update(TunnelOld, TunnelNew) -> Insert = ordsets:subtract(NewKeys, OldKeys), gtp_context_reg:update(Delete, Insert, ?MODULE, self()). +%% Used in tests only delete_context(Context) -> - gen_statem:call(Context, delete_context). + gen_statem:call(Context, {delete_context, normal}). +%% Trigger from admin API trigger_delete_context(Context) -> - gen_statem:cast(Context, delete_context). + gen_statem:cast(Context, {delete_context, administrative}). %% TODO: add online charing events collect_charging_events(OldS, NewS) -> @@ -395,7 +397,7 @@ handle_event({call, From}, {sx, #pfcp{type = session_report_request, ie = #{report_type := #report_type{upir = 1}}}}, State, #{pfcp := PCtx} = Data) -> - close_context(both, inactivity_timeout, State, Data), + close_context(both, up_inactivity_timeout, State, Data), {next_state, shutdown, Data, [{reply, From, {ok, PCtx}}]}; %% Usage Report @@ -443,9 +445,9 @@ handle_event(cast, {handle_response, ReqInfo, Request, Response0}, State, erlang:raise(Class, Reason, Stacktrace) end; -handle_event(info, #aaa_request{procedure = {_, 'ASR'}} = Request, State, Data) -> +handle_event(info, #aaa_request{procedure = {_, 'ASR'} = Procedure} = Request, State, Data) -> ergw_aaa_session:response(Request, ok, #{}, #{}), - delete_context(undefined, administrative, State, Data); + delete_context(undefined, Procedure, State, Data); handle_event(info, #aaa_request{procedure = {gx, 'RAR'}, events = Events} = Request, @@ -553,8 +555,12 @@ 
handle_event(internal, {session, {update_credits, _} = CreditEv, _}, _State, end, {keep_state, Data#{pfcp := PCtx, pcc := PCC}}; +%% Enable AAA to provide reason for session stop +handle_event(internal, {session, {stop, Reason}, _Session}, State, Data) -> + delete_context(undefined, Reason, State, Data); + handle_event(internal, {session, stop, _Session}, State, Data) -> - delete_context(undefined, normal, State, Data); + delete_context(undefined, error, State, Data); handle_event(internal, {session, Ev, _}, _State, _Data) -> ?LOG(error, "unhandled session event: ~p", [Ev]), @@ -569,6 +575,9 @@ handle_event(info, {timeout, TRef, pfcp_timer} = Info, _State, #{pfcp := PCtx0} ergw_gtp_gsn_lib:triggered_charging_event(validity_time, Now, ChargingKeys, Data), {keep_state, Data}; +handle_event({call, From}, {delete_context, Reason}, State, Data) + when State == connected; State == connecting -> + delete_context(From, Reason, State, Data); handle_event({call, From}, delete_context, State, Data) when State == connected; State == connecting -> delete_context(From, administrative, State, Data); @@ -598,8 +607,8 @@ handle_event(cast, {usage_report, URRActions, UsageReport}, _State, Data) -> ergw_gtp_gsn_lib:usage_report(URRActions, UsageReport, Data), keep_state_and_data; -handle_event(cast, delete_context, State, Data) -> - delete_context(undefined, administrative, State, Data); +handle_event(cast, {delete_context, Reason}, State, Data) -> + delete_context(undefined, Reason, State, Data); handle_event(info, {'DOWN', _MonitorRef, Type, Pid, _Info}, State, #{pfcp := #pfcp_ctx{node = Pid}} = Data) @@ -861,11 +870,11 @@ fteid_tunnel_side_f(#f_teid{ipv4 = IPv4, ipv6 = IPv6, teid = TEID}, fteid_tunnel_side_f(FqTEID, {_, _, Iter}) -> fteid_tunnel_side_f(FqTEID, maps:next(Iter)). -close_context(Side, TermCause, State, #{interface := Interface} = Data) -> - Interface:close_context(Side, TermCause, State, Data). 
+close_context(Side, Reason, State, #{interface := Interface} = Data) -> + Interface:close_context(Side, Reason, State, Data). -delete_context(From, TermCause, State, #{interface := Interface} = Data) -> - Interface:delete_context(From, TermCause, State, Data). +delete_context(From, Reason, State, #{interface := Interface} = Data) -> + Interface:delete_context(From, Reason, State, Data). %%==================================================================== %% asynchrounus usage reporting diff --git a/src/pgw_s5s8.erl b/src/pgw_s5s8.erl index 0da75850..275c97b0 100644 --- a/src/pgw_s5s8.erl +++ b/src/pgw_s5s8.erl @@ -36,6 +36,7 @@ -import(ergw_aaa_session, [to_session/1]). +-define(API, 's5/s8'). -define(GTP_v1_Interface, ggsn_gn). -define(T3, 10 * 1000). -define(N3, 5). @@ -383,7 +384,7 @@ handle_request(ReqKey, case match_tunnel(?'S5/S8-C SGW', LeftTunnel, FqTEID) of ok -> process_secondary_rat_usage_data_reports(IEs, Context, Session), - ergw_gtp_gsn_lib:close_context(normal, Data), + ergw_gtp_gsn_lib:close_context(?API, normal, Data), Response = response(delete_session_response, LeftTunnel, request_accepted), gtp_context:send_response(ReqKey, Request, Response), {next_state, shutdown, Data}; @@ -432,7 +433,7 @@ handle_response(_, timeout, #gtp{type = update_bearer_request}, connected = Stat handle_response({From, TermCause}, timeout, #gtp{type = delete_bearer_request}, _State, Data) -> - ergw_gtp_gsn_lib:close_context(TermCause, Data), + ergw_gtp_gsn_lib:close_context(?API, TermCause, Data), if is_tuple(From) -> gen_statem:reply(From, {error, timeout}); true -> ok end, @@ -449,7 +450,7 @@ handle_response({From, TermCause}, DataNew = Data#{left_tunnel => LeftTunnel}, process_secondary_rat_usage_data_reports(IEs, Context, Session), - ergw_gtp_gsn_lib:close_context(TermCause, DataNew), + ergw_gtp_gsn_lib:close_context(?API, TermCause, DataNew), if is_tuple(From) -> gen_statem:reply(From, {ok, RespCause}); true -> ok end, @@ -527,7 +528,7 @@ encode_paa(Type, IPv4, 
IPv6) -> #v2_pdn_address_allocation{type = Type, address = <>}. close_context(_Side, Reason, _State, Data) -> - ergw_gtp_gsn_lib:close_context(Reason, Data). + ergw_gtp_gsn_lib:close_context(?API, Reason, Data). map_attr('APN', #{?'Access Point Name' := #v2_access_point_name{apn = APN}}) -> unicode:characters_to_binary(lists:join($., APN)); diff --git a/src/saegw_s11.erl b/src/saegw_s11.erl index b37b3b88..3e2b8a91 100644 --- a/src/saegw_s11.erl +++ b/src/saegw_s11.erl @@ -32,6 +32,7 @@ -import(ergw_aaa_session, [to_session/1]). +-define(API, 's11'). -define(GTP_v1_Interface, ggsn_gn). -define(T3, 10 * 1000). -define(N3, 5). @@ -308,7 +309,7 @@ handle_request(ReqKey, case match_tunnel(?'S11-C MME', LeftTunnel, FqTEID) of ok -> - ergw_gtp_gsn_lib:close_context(normal, Data), + ergw_gtp_gsn_lib:close_context(?API, normal, Data), Response = response(delete_session_response, LeftTunnel, request_accepted), gtp_context:send_response(ReqKey, Request, Response), {next_state, shutdown, Data}; @@ -352,7 +353,7 @@ handle_response(_, timeout, #gtp{type = update_bearer_request}, connected = Stat handle_response({From, TermCause}, timeout, #gtp{type = delete_bearer_request}, _State, Data) -> - ergw_gtp_gsn_lib:close_context(TermCause, Data), + ergw_gtp_gsn_lib:close_context(?API, TermCause, Data), if is_tuple(From) -> gen_statem:reply(From, {error, timeout}); true -> ok end, @@ -366,7 +367,7 @@ handle_response({From, TermCause}, DataNew = Data#{left_tunnel => LeftTunnel}, - ergw_gtp_gsn_lib:close_context(TermCause, Data), + ergw_gtp_gsn_lib:close_context(?API, TermCause, Data), if is_tuple(From) -> gen_statem:reply(From, {ok, Cause}); true -> ok end, @@ -444,7 +445,7 @@ encode_paa(Type, IPv4, IPv6) -> #v2_pdn_address_allocation{type = Type, address = <>}. close_context(_Side, Reason, _State, Data) -> - ergw_gtp_gsn_lib:close_context(Reason, Data). + ergw_gtp_gsn_lib:close_context(?API, Reason, Data). 
copy_ppp_to_session({pap, 'PAP-Authentication-Request', _Id, Username, Password}, Session0) -> Session = Session0#{'Username' => Username, 'Password' => Password}, diff --git a/src/tdf.erl b/src/tdf.erl index 931b44e8..43bf3b7b 100644 --- a/src/tdf.erl +++ b/src/tdf.erl @@ -41,6 +41,7 @@ -import(ergw_aaa_session, [to_session/1]). +-define(API, tdf). -define(SERVER, ?MODULE). -define(TestCmdTag, '$TestCmd'). @@ -235,9 +236,9 @@ handle_event(info, #aaa_request{procedure = {_, 'RAR'}} = Request, shutdown, _Da ergw_aaa_session:response(Request, {error, unknown_session}, #{}, #{}), keep_state_and_data; -handle_event(info, #aaa_request{procedure = {_, 'ASR'}} = Request, State, Data) -> +handle_event(info, #aaa_request{procedure = {_, 'ASR'} = Procedure} = Request, State, Data) -> ergw_aaa_session:response(Request, ok, #{}, #{}), - close_pdn_context(administrative, State, Data), + close_pdn_context(Procedure, State, Data), {next_state, shutdown, Data}; handle_event(info, #aaa_request{procedure = {gx, 'RAR'}, @@ -324,6 +325,11 @@ handle_event(info, #aaa_request{procedure = {gy, 'RAR'}, triggered_charging_event(interim, Now, ChargingKeys, Data), keep_state_and_data; +%% Enable AAA to provide reason for session stop +handle_event(internal, {session, {stop, Reason}, _Session}, State, Data) -> + close_pdn_context(Reason, State, Data), + {next_state, shutdown, Data}; + handle_event(internal, {session, stop, _Session}, State, Data) -> close_pdn_context(normal, State, Data), {next_state, shutdown, Data}; @@ -503,8 +509,10 @@ ccr_initial(Session, API, SessionOpts, ReqOpts) -> {error, {'CCR-Initial', Fail}} end. 
-close_pdn_context(Reason, run, #data{pfcp = PCtx, session = Session}) -> - URRs = ergw_pfcp_context:delete_session(Reason, PCtx), +close_pdn_context(Reason, State, Data) when is_atom(Reason) -> + close_pdn_context({?API, Reason}, State, Data); +close_pdn_context({API, TermCause}, run, #data{pfcp = PCtx, session = Session}) -> + URRs = ergw_pfcp_context:delete_session(TermCause, PCtx), %% TODO: Monitors, AAA over SGi @@ -519,14 +527,14 @@ close_pdn_context(Reason, run, #data{pfcp = PCtx, session = Session}) -> ?LOG(warning, "Gx terminate failed with: ~p", [GxOther]) end, - ChargeEv = {terminate, Reason}, + ChargeEv = {terminate, TermCause}, {Online, Offline, Monitor} = ergw_pfcp_context:usage_report_to_charging_events(URRs, ChargeEv, PCtx), ergw_gsn_lib:process_accounting_monitor_events(ChargeEv, Monitor, Now, Session), GyReqServices = ergw_gsn_lib:gy_credit_report(Online), ergw_gsn_lib:process_online_charging_events(ChargeEv, GyReqServices, Session, ReqOpts), ergw_gsn_lib:process_offline_charging_events(ChargeEv, Offline, Now, Session), - ergw_prometheus:termination_cause(?FUNCTION_NAME, Reason), + ergw_prometheus:termination_cause(API, TermCause), ok; close_pdn_context(_Reason, _State, _Data) -> diff --git a/test/pgw_SUITE.erl b/test/pgw_SUITE.erl index 9cacee58..f77065f4 100644 --- a/test/pgw_SUITE.erl +++ b/test/pgw_SUITE.erl @@ -4911,6 +4911,8 @@ gy_async_stop() -> [{doc, "Check that a error/stop from async session call terminates the context"}]. gy_async_stop(Config) -> Cntl = whereis(gtpc_client_server), + MfrId = [gy, peer_reject], + MfrCnt = get_metric(prometheus_counter, termination_cause_total, MfrId, 0), {GtpC, _, _} = create_session(Config), @@ -4924,6 +4926,8 @@ gy_async_stop(Config) -> ok = meck:wait(?HUT, terminate, '_', ?TIMEOUT), wait4contexts(?TIMEOUT), + ?match_metric(prometheus_counter, termination_cause_total, MfrId, MfrCnt + 1), + meck_validate(Config), ok.