Skip to content

Commit

Permalink
Prevent other notification event storms to keep enqueue unchecked and…
Browse files Browse the repository at this point in the history
… drained all memory that leads to crashing the switch router (sonic-net#968)
  • Loading branch information
gechiang authored Nov 29, 2021
1 parent 0cb253a commit 26a8a12
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 7 deletions.
50 changes: 45 additions & 5 deletions syncd/NotificationQueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@ using namespace syncd;
#define MUTEX std::lock_guard<std::mutex> _lock(m_mutex);

NotificationQueue::NotificationQueue(
_In_ size_t queueLimit):
_In_ size_t queueLimit,
_In_ size_t consecutiveThresholdLimit):
m_queueSizeLimit(queueLimit),
m_dropCount(0)
m_thresholdLimit(consecutiveThresholdLimit),
m_dropCount(0),
m_lastEventCount(0),
m_lastEvent(SAI_SWITCH_NOTIFICATION_NAME_FDB_EVENT)
{
SWSS_LOG_ENTER();

Expand All @@ -30,16 +34,52 @@ bool NotificationQueue::enqueue(
MUTEX;

SWSS_LOG_ENTER();
bool candidateToDrop = false;
std::string currentEvent;

/*
* If the queue exceeds the limit, then drop all further FDB events This is
* a temporary solution to handle high memory usage by syncd and the
* notification queue keeps growing. The permanent solution would be to
* make this stateful so that only the *latest* event is published.
*
* We have also seen other notification storms that can also cause this queue issue
* So the new scheme is to keep the last notification event and its consecutive count
* If threshold limit reached and the consecutive count also reached then this notification
* will also be dropped regardless of its event type to protect the device from crashing due to
* running out of memory
*/
auto queueSize = m_queue.size();
currentEvent = kfvKey(item);
if (currentEvent == m_lastEvent)
{
m_lastEventCount++;
}
else
{
m_lastEventCount = 1;
m_lastEvent = currentEvent;
}
if (queueSize >= m_queueSizeLimit)
{
/* Too many queued up already check if notification fits condition to e dropped
* 1. All FDB events should be dropped at this point.
* 2. All other notification events will start to drop if it reached the consecutive threshold limit
*/
if (currentEvent == SAI_SWITCH_NOTIFICATION_NAME_FDB_EVENT)
{
candidateToDrop = true;
}
else
{
if (m_lastEventCount >= m_thresholdLimit)
{
candidateToDrop = true;
}
}
}

if (queueSize < m_queueSizeLimit || kfvKey(item) != SAI_SWITCH_NOTIFICATION_NAME_FDB_EVENT) // TODO use enum instead of strings
if (!candidateToDrop)
{
m_queue.push(item);

Expand All @@ -51,9 +91,9 @@ bool NotificationQueue::enqueue(
if (!(m_dropCount % NOTIFICATION_QUEUE_DROP_COUNT_INDICATOR))
{
SWSS_LOG_NOTICE(
"Too many messages in queue (%zu), dropped %zu FDB events!",
"Too many messages in queue (%zu), dropped (%zu), lastEventCount (%zu) Dropping %s !",
queueSize,
m_dropCount);
m_dropCount, m_lastEventCount, m_lastEvent.c_str());
}

return false;
Expand Down
17 changes: 16 additions & 1 deletion syncd/NotificationQueue.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,17 @@ extern "C" {
* Value based on typical L2 deployment with 256k MAC entries and
* some extra buffer for other events like port-state, q-deadlock etc
*
* Note: We recently found a case where SAI continuously sending switch notification events
* that also caused the queue to keep growing and eventually exhaust all system memory and crashed.
* So a new detection/limit scheme is being implemented by keeping track of the last Event
* and if the current Event matches the last Event, then the last Event Count will get
* incremented and this count will also be used as part of the equation to ensure this
* notification should also be dropped if the queue size limit has already reached and not
* just dropping FDB events only.
* TODO: move to config, also this limit only applies to fdb notifications
*/
#define DEFAULT_NOTIFICATION_QUEUE_SIZE_LIMIT (300000)
#define DEFAULT_NOTIFICATION_CONSECUTIVE_THRESHOLD (1000)

namespace syncd
{
Expand All @@ -27,7 +35,8 @@ namespace syncd
public:

NotificationQueue(
_In_ size_t limit = DEFAULT_NOTIFICATION_QUEUE_SIZE_LIMIT);
_In_ size_t limit = DEFAULT_NOTIFICATION_QUEUE_SIZE_LIMIT,
_In_ size_t consecutiveThresholdLimit = DEFAULT_NOTIFICATION_CONSECUTIVE_THRESHOLD);

virtual ~NotificationQueue();

Expand All @@ -49,6 +58,12 @@ namespace syncd

size_t m_queueSizeLimit;

size_t m_thresholdLimit;

size_t m_dropCount;

size_t m_lastEventCount;

std::string m_lastEvent;
};
}
3 changes: 2 additions & 1 deletion unittest/syncd/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ tests_SOURCES = main.cpp \
MockableSaiInterface.cpp \
MockHelper.cpp \
TestCommandLineOptions.cpp \
TestFlexCounter.cpp
TestFlexCounter.cpp \
TestNotificationQueue.cpp

tests_CXXFLAGS = $(DBGFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS_COMMON)
tests_LDADD = $(LDADD_GTEST) $(top_srcdir)/syncd/libSyncd.a -lhiredis -lswsscommon -lpthread -L$(top_srcdir)/lib/.libs -lsairedis -L$(top_srcdir)/meta/.libs -lsaimetadata -lsaimeta -lzmq $(CODE_COVERAGE_LIBS)
Expand Down
68 changes: 68 additions & 0 deletions unittest/syncd/TestNotificationQueue.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#include "NotificationQueue.h"

#include "sairediscommon.h"

#include <gtest/gtest.h>

using namespace syncd;

static std::string fdbData =
"[{\"fdb_entry\":\"{\\\"bvid\\\":\\\"oid:0x260000000005be\\\",\\\"mac\\\":\\\"52:54:00:86:DD:7A\\\",\\\"switch_id\\\":\\\"oid:0x21000000000000\\\"}\","
"\"fdb_event\":\"SAI_FDB_EVENT_LEARNED\","
"\"list\":[{\"id\":\"SAI_FDB_ENTRY_ATTR_TYPE\",\"value\":\"SAI_FDB_ENTRY_TYPE_DYNAMIC\"},{\"id\":\"SAI_FDB_ENTRY_ATTR_BRIDGE_PORT_ID\",\"value\":\"oid:0x3a000000000660\"}]}]";

static std::string sscData = "{\"status\":\"SAI_SWITCH_OPER_STATUS_UP\",\"switch_id\":\"oid:0x2100000000\"}";

TEST(NotificationQueue, EnqueueLimitTest)
{
bool status;
int i;
std::vector<swss::FieldValueTuple> fdbEntry;
std::vector<swss::FieldValueTuple> sscEntry;

// Set up a queue with limit at 5 and threshold at 3 where after this is reached event starts dropping
syncd::NotificationQueue testQ(5, 3);

// Try queue up 5 fake FDB event and expect them to be added successfully
swss::KeyOpFieldsValuesTuple fdbItem(SAI_SWITCH_NOTIFICATION_NAME_FDB_EVENT, fdbData, fdbEntry);
for (i = 0; i < 5; ++i)
{
status = testQ.enqueue(fdbItem);
EXPECT_EQ(status, true);
}

// On the 6th fake FDB event expect it to be dropped right away
status = testQ.enqueue(fdbItem);
EXPECT_EQ(status, false);

// Add 2 switch state change events expect both are accepted as consecutive limit not yet reached
swss::KeyOpFieldsValuesTuple sscItem(SAI_SWITCH_NOTIFICATION_NAME_SWITCH_STATE_CHANGE, sscData, sscEntry);
for (i = 0; i < 2; ++i)
{
status = testQ.enqueue(sscItem);
EXPECT_EQ(status, true);
}

// On the 3rd consecutive switch state change event expect it to be dropped
status = testQ.enqueue(sscItem);
EXPECT_EQ(status, false);

// Add a fake FDB event to cause the consecutive event signature to change while this FDB event is dropped
status = testQ.enqueue(fdbItem);
EXPECT_EQ(status, false);

// Add 2 switch state change events expect both are accepted as consecutive limit not yet reached
for (i = 0; i < 2; ++i)
{
status = testQ.enqueue(sscItem);
EXPECT_EQ(status, true);
}

// Add 2 switch state change events expect both are dropped as consecutive limit has already reached
for (i = 0; i < 2; ++i)
{
status = testQ.enqueue(sscItem);
EXPECT_EQ(status, false);
}
}

0 comments on commit 26a8a12

Please sign in to comment.