Skip to content

Commit

Permalink
Teamd :: fix for cleaning up the teamd processes correctly on teamd d…
Browse files Browse the repository at this point in the history
…ocker stop (sonic-net#1159)

* Send explicit signal to the teamd processes whenthe teamd docker exits.

When the teamd docker receives a stop signal, only the processes started by supervisord gets
the SIGTERM, so this fix is to propogate the signal to teamd processes
via the signal handler in teamsyncd process.

* Updates to take care of boundary conditions in the teamsyncd signal handler.

* Better way of signal Handling by setting a flag in the signal handler and checking for the flag in the main loop.
This way the cleanUp handler is not run in the signal Handler context and can add more Logs as we
need not care for signal safety now.

* Updated the logic so that teammgrd controls the lifecycle of teamd.
Teammgrd tracks the PID's of the teamd processes and sents the SIGTERM
signal when the teamd docker is stopped.

* Minor change in the function defenition

* Updates based on the comments

* Minor update in teammgr.cpp
  • Loading branch information
judyjoseph authored and abdosi committed Jan 21, 2020
1 parent 3e53c3e commit 15441fe
Show file tree
Hide file tree
Showing 6 changed files with 150 additions and 0 deletions.
79 changes: 79 additions & 0 deletions cfgmgr/teammgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,15 @@

#include <algorithm>
#include <sstream>
#include <fstream>
#include <thread>

#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <signal.h>

#define PID_FILE_PATH "/var/run/teamd/"

using namespace std;
using namespace swss;
Expand Down Expand Up @@ -108,6 +112,79 @@ void TeamMgr::doTask(Consumer &consumer)
}
}


pid_t TeamMgr::getTeamPid(const string &alias)
{
SWSS_LOG_ENTER();
pid_t pid = 0;

string file = string(PID_FILE_PATH) + alias + string(".pid");
ifstream infile(file);
if (!infile.is_open())
{
SWSS_LOG_WARN("The LAG PID file: %s is not readable", file.c_str());
return 0;
}

string line;
getline(infile, line);
if (line.empty())
{
SWSS_LOG_WARN("The LAG PID file: %s is empty", file.c_str());
}
else
{
/*Store the PID value */
pid = stoi(line, nullptr, 10);
}

/* Close the file and return */
infile.close();

return pid;
}


void TeamMgr::addLagPid(const string &alias)
{
SWSS_LOG_ENTER();
m_lagPIDList[alias] = getTeamPid(alias);
}

void TeamMgr::removeLagPid(const string &alias)
{
SWSS_LOG_ENTER();
m_lagPIDList.erase(alias);
}

void TeamMgr::cleanTeamProcesses(int signo)
{
pid_t pid = 0;

for (const auto& it: m_lagList)
{
pid = m_lagPIDList[it];
if(!pid) {
SWSS_LOG_WARN("Invalid PID found for LaG %s ", it.c_str());

/* Try to get the PID again */
pid = getTeamPid(it);
}

if(pid > 0)
{
SWSS_LOG_INFO("Sending TERM Signal to (PID: %d) for LaG %s ", pid, it.c_str());
kill(pid, signo);
}
else
{
SWSS_LOG_ERROR("Can't send TERM signal to LAG %s. PID wasn't found", it.c_str());
}
}

return;
}

void TeamMgr::doLagTask(Consumer &consumer)
{
SWSS_LOG_ENTER();
Expand Down Expand Up @@ -171,6 +248,7 @@ void TeamMgr::doLagTask(Consumer &consumer)
}

m_lagList.insert(alias);
addLagPid(alias);
}

setLagAdminStatus(alias, admin_status);
Expand All @@ -187,6 +265,7 @@ void TeamMgr::doLagTask(Consumer &consumer)
{
removeLag(alias);
m_lagList.erase(alias);
removeLagPid(alias);
}
}

Expand Down
8 changes: 8 additions & 0 deletions cfgmgr/teammgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "netmsg.h"
#include "orch.h"
#include "producerstatetable.h"
#include <sys/types.h>

namespace swss {

Expand All @@ -17,6 +18,8 @@ class TeamMgr : public Orch
const std::vector<TableConnector> &tables);

using Orch::doTask;
void cleanTeamProcesses(int signo);

private:
Table m_cfgMetadataTable; // To retrieve MAC address
Table m_cfgPortTable;
Expand All @@ -29,6 +32,7 @@ class TeamMgr : public Orch
ProducerStateTable m_appLagTable;

std::set<std::string> m_lagList;
std::map<std::string, pid_t> m_lagPIDList;

MacAddress m_mac;

Expand All @@ -45,6 +49,10 @@ class TeamMgr : public Orch
bool setLagAdminStatus(const std::string &alias, const std::string &admin_status);
bool setLagMtu(const std::string &alias, const std::string &mtu);
bool setLagLearnMode(const std::string &alias, const std::string &learn_mode);

pid_t getTeamPid(const std::string &alias);
void addLagPid(const std::string &alias);
void removeLagPid(const std::string &alias);

bool isPortEnslaved(const std::string &);
bool findPortMaster(std::string &, const std::string &);
Expand Down
18 changes: 18 additions & 0 deletions cfgmgr/teammgrd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "netlink.h"
#include "select.h"
#include "warm_restart.h"
#include <signal.h>

using namespace std;
using namespace swss;
Expand All @@ -17,13 +18,24 @@ bool gLogRotate = false;
ofstream gRecordOfs;
string gRecordFile;

bool received_sigterm = false;

void sig_handler(int signo)
{
received_sigterm = true;
return;
}

int main(int argc, char **argv)
{
Logger::linkToDbNative("teammgrd");
SWSS_LOG_ENTER();

SWSS_LOG_NOTICE("--- Starting teammrgd ---");

/* Register the signal handler for SIGTERM */
signal(SIGTERM, sig_handler);

try
{
DBConnector conf_db("CONFIG_DB", 0);
Expand Down Expand Up @@ -55,6 +67,12 @@ int main(int argc, char **argv)

while (true)
{
if(received_sigterm)
{
teammgr.cleanTeamProcesses(SIGTERM);
received_sigterm = false;
}

Selectable *sel;
int ret;

Expand Down
26 changes: 26 additions & 0 deletions teamsyncd/teamsync.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,22 @@ void TeamSync::onMsg(int nlmsg_type, struct nl_object *obj)
if (!type || (strcmp(type, TEAM_DRV_NAME) != 0))
return;

unsigned int flags = rtnl_link_get_flags(link);
bool admin = flags & IFF_UP;
bool oper = flags & IFF_LOWER_UP;
unsigned int ifindex = rtnl_link_get_ifindex(link);

if (type)
{
SWSS_LOG_INFO(" nlmsg type:%d key:%s admin:%d oper:%d ifindex:%d type:%s",
nlmsg_type, lagName.c_str(), admin, oper, ifindex, type);
}
else
{
SWSS_LOG_INFO(" nlmsg type:%d key:%s admin:%d oper:%d ifindex:%d",
nlmsg_type, lagName.c_str(), admin, oper, ifindex);
}

if (nlmsg_type == RTM_DELLINK)
{
if (m_teamSelectables.find(lagName) != m_teamSelectables.end())
Expand Down Expand Up @@ -194,6 +210,16 @@ void TeamSync::removeLag(const string &lagName)
m_selectablesToRemove.insert(lagName);
}

void TeamSync::cleanTeamSync()
{
for (const auto& it: m_teamSelectables)
{
/* Cleanup LAG */
removeLag(it.first);
}
return;
}

const struct team_change_handler TeamSync::TeamPortSync::gPortChangeHandler = {
.func = TeamSync::TeamPortSync::teamdHandler,
.type_mask = TEAM_PORT_CHANGE | TEAM_OPTION_CHANGE
Expand Down
1 change: 1 addition & 0 deletions teamsyncd/teamsync.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class TeamSync : public NetMsg
TeamSync(DBConnector *db, DBConnector *stateDb, Select *select);

void periodic();
void cleanTeamSync();

/* Listen to RTM_NEWLINK, RTM_DELLINK to track team devices */
virtual void onMsg(int nlmsg_type, struct nl_object *obj);
Expand Down
18 changes: 18 additions & 0 deletions teamsyncd/teamsyncd.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <iostream>
#include <team.h>
#include <signal.h>
#include "logger.h"
#include "select.h"
#include "netdispatcher.h"
Expand All @@ -9,6 +10,14 @@
using namespace std;
using namespace swss;

bool received_sigterm = false;

void sig_handler(int signo)
{
received_sigterm = true;
return;
}

int main(int argc, char **argv)
{
swss::Logger::linkToDbNative(TEAMSYNCD_APP_NAME);
Expand All @@ -20,6 +29,9 @@ int main(int argc, char **argv)
NetDispatcher::getInstance().registerMessageHandler(RTM_NEWLINK, &sync);
NetDispatcher::getInstance().registerMessageHandler(RTM_DELLINK, &sync);

/* Register the signal handler for SIGTERM */
signal(SIGTERM, sig_handler);

while (1)
{
try
Expand All @@ -33,6 +45,12 @@ int main(int argc, char **argv)
s.addSelectable(&netlink);
while (true)
{
if(received_sigterm)
{
sync.cleanTeamSync();
received_sigterm = false;
}

Selectable *temps;
s.select(&temps, 1000); // block for a second
sync.periodic();
Expand Down

0 comments on commit 15441fe

Please sign in to comment.