From a923a545ab1aeda3807578cde0d95b009799bb67 Mon Sep 17 00:00:00 2001 From: vedganes Date: Tue, 29 Nov 2022 13:51:35 -0500 Subject: [PATCH 1/2] [routesync] Fix for stale dynamic neighbor Signed-off-by: vedganes The changes are for fixing stale neighbor in the ASIC_DB and data path when eBGP neighbors are shutdown and neighbors are flushed. The problem is described in issue: https://github.com/sonic-net/sonic-buildimage/issues/12442 The root cause of this issue is due to not deleing the route from the ASIC_DB when the route's next hop is on eth0 or docker0 interface. The solution is to delete the route entry from ASIC_DB instead of just returning when the route's next hop is on the interface eth0 or docker0 --- fpmsyncd/routesync.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fpmsyncd/routesync.cpp b/fpmsyncd/routesync.cpp index ca00d4f77d..9459c17b31 100644 --- a/fpmsyncd/routesync.cpp +++ b/fpmsyncd/routesync.cpp @@ -733,6 +733,17 @@ void RouteSync::onRouteMsg(int nlmsg_type, struct nl_object *obj, char *vrf) { SWSS_LOG_DEBUG("Skip routes to eth0 or docker0: %s %s %s", destipprefix, gw_list.c_str(), intf_list.c_str()); + // If intf_list has only this interface, that means all of the next hops of this route + // have been removed and the next hop on the eth0/docker0 has become the only next hop. + // In this case since we do not want the route with next hop on eth0/docker0, we return. + // But still we need to clear the route from the APPL_DB. Otherwise the APPL_DB and data + // path will be left with stale route entry + if(alsv.size() == 1) + { + SWSS_LOG_NOTICE("RouteTable del msg for route with only one nh on eth0/docker0: %s %s %s %s", + destipprefix, gw_list.c_str(), intf_list.c_str(), mpls_list.c_str()); + m_routeTable.del(destipprefix); + } return; } } From e5f590c484f66509f7d6da1cd467f2310abb4c15 Mon Sep 17 00:00:00 2001 From: vedganes Date: Tue, 10 Jan 2023 12:35:43 -0500 Subject: [PATCH 2/2] [routesync] Fix to handle warm restart Signed-off-by: vedganes This commit fixes the warm restart unit test failure. When the route with only nh on eth0 or docker0 is removed and if the route is the default route, orchagent sends "add" black hole route to the syncd. So the ASIC DB gets n hset message. When this happens during warm restart, the unit test identifies this as unwanted setting and the unit test fails. To fix this issues, the route delete is sent only if the warm restart is not in progress. This is done following the same warm restart handling approach used for route delete in other palces. --- fpmsyncd/routesync.cpp | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/fpmsyncd/routesync.cpp b/fpmsyncd/routesync.cpp index 9459c17b31..e87b9fa323 100644 --- a/fpmsyncd/routesync.cpp +++ b/fpmsyncd/routesync.cpp @@ -740,9 +740,24 @@ void RouteSync::onRouteMsg(int nlmsg_type, struct nl_object *obj, char *vrf) // path will be left with stale route entry if(alsv.size() == 1) { - SWSS_LOG_NOTICE("RouteTable del msg for route with only one nh on eth0/docker0: %s %s %s %s", - destipprefix, gw_list.c_str(), intf_list.c_str(), mpls_list.c_str()); - m_routeTable.del(destipprefix); + if (!warmRestartInProgress) + { + SWSS_LOG_NOTICE("RouteTable del msg for route with only one nh on eth0/docker0: %s %s %s %s", + destipprefix, gw_list.c_str(), intf_list.c_str(), mpls_list.c_str()); + + m_routeTable.del(destipprefix); + } + else + { + SWSS_LOG_NOTICE("Warm-Restart mode: Receiving delete msg for route with only nh on eth0/docker0: %s %s %s %s", + destipprefix, gw_list.c_str(), intf_list.c_str(), mpls_list.c_str()); + + vector fvVector; + const KeyOpFieldsValuesTuple kfv = std::make_tuple(destipprefix, + DEL_COMMAND, + fvVector); + m_warmStartHelper.insertRefreshMap(kfv); + } } return; }