From 5f294cf1d48a4679c1134b785058214c0335e5fd Mon Sep 17 00:00:00 2001
From: Stephen Sun <5379172+stephenxs@users.noreply.github.com>
Date: Tue, 15 Aug 2023 00:38:57 +0800
Subject: [PATCH] [Dynamic Buffer][Mellanox] Skip PGs in pending deleting set while checking accumulative headroom of a port (#2871)

**What I did**

In the Lua script, skip the PGs that are about to be removed while checking the accumulative headroom of a port.

**Why I did it**

This is to avoid the following error message:

```
Jul 26 14:59:04.754566 r-moose-02 ERR swss#buffermgrd: :- guard: RedisReply catches system_error: command: .*, reason: ERR Error running script (call to f_a02f6f856d876d607a7ac81f5bc0890cad68bf71): @user_script:125: user_script:125: attempt to perform arithmetic on local 'current_profile_size' (a nil value): Input/output error
```

This is because:

1. Too many notifications had accumulated in the `m_toSync` queue, belonging to `BUFFER_PROFILE_TABLE` and `BUFFER_PG_TABLE`.
2. Even though the buffer manager removed the buffer PG ahead of the buffer profile, the buffer profile was handled ahead of the buffer PG in the orchagent, which means they were handled in reverse order. As a result, the buffer PG notification was still in the `BUFFER_PG_TABLE_DEL_SET` set and its entry remained in `BUFFER_PG_TABLE`, while the buffer profile had already been removed from `BUFFER_PROFILE_TABLE`.
3. When the script checked the accumulative headroom, it fetched all items from the APPL_DB tables and got the to-be-removed buffer PG, but it did not get the buffer profile because the latter had already been removed in step 2.
4. As a result, it complained that `current_profile_size` was nil, which was the consequence of step 3.

Fix: do not check buffer PGs that are in `BUFFER_PG_TABLE_DEL_SET`.
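
For illustration, here is a minimal standalone Lua sketch of the failure mode and of the guard (plain Lua, no Redis; the table contents, sample keys, and names such as `profile_sizes` and `accumulative_size` are hypothetical stand-ins, not the script's actual internals):

```
-- Hypothetical APPL_DB snapshot after step 2: the PG entry still exists,
-- but the profile it references has already been removed.
local profile_sizes = { ["ingress_lossy_profile"] = 0 }
local all_pgs = {
    ["BUFFER_PG_TABLE:Ethernet0:3-4"] = "pg_lossless_100000_5m_profile", -- profile already gone
    ["BUFFER_PG_TABLE:Ethernet0:0"]   = "ingress_lossy_profile",
}
-- Keys queued for deletion (mirrors BUFFER_PG_TABLE_DEL_SET).
local pending_remove_pg_set = { ["BUFFER_PG_TABLE:Ethernet0:3-4"] = true }

local accumulative_size = 0
for pg_key, profile_name in pairs(all_pgs) do
    if not pending_remove_pg_set[pg_key] then
        local current_profile_size = profile_sizes[profile_name]
        -- Without the guard above, the Ethernet0:3-4 entry would reach this
        -- point with current_profile_size == nil, and the addition would raise
        -- "attempt to perform arithmetic on ... (a nil value)".
        accumulative_size = accumulative_size + current_profile_size
    end
end
print(accumulative_size)  -- 0: only PGs that are not pending removal are counted
```

The actual change below applies the same skip while collecting the PGs from APPL_DB, so a PG whose removal is still queued never participates in the headroom calculation.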

**How I verified it**

Regression and manual test.
---
 cfgmgr/buffer_check_headroom_mellanox.lua | 17 +++++++++++++++--
 tests/test_buffer_dynamic.py              | 13 +++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/cfgmgr/buffer_check_headroom_mellanox.lua b/cfgmgr/buffer_check_headroom_mellanox.lua
index 6ae5b883ba..1b6851f77d 100644
--- a/cfgmgr/buffer_check_headroom_mellanox.lua
+++ b/cfgmgr/buffer_check_headroom_mellanox.lua
@@ -82,12 +82,25 @@ local function get_number_of_pgs(keyname)
     return size
 end
 
+-- Fetch all the PGs that are pending removal
+local pending_remove_pg_keys = redis.call('SMEMBERS', 'BUFFER_PG_TABLE_DEL_SET')
+local pending_remove_pg_set = {}
+for i = 1, #pending_remove_pg_keys do
+    pending_remove_pg_set['BUFFER_PG_TABLE:' .. pending_remove_pg_keys[i]] = true
+    table.insert(debuginfo, 'debug:pending remove entry found: ' .. 'BUFFER_PG_TABLE:' .. pending_remove_pg_keys[i])
+end
+
 -- Fetch all the PGs in APPL_DB, and store them into a hash table
+-- But skip the items that are in pending_remove_pg_set
 local pg_keys = redis.call('KEYS', 'BUFFER_PG_TABLE:' .. port .. ':*')
 local all_pgs = {}
 for i = 1, #pg_keys do
-    local profile = redis.call('HGET', pg_keys[i], 'profile')
-    all_pgs[pg_keys[i]] = profile
+    if not pending_remove_pg_set[pg_keys[i]] then
+        local profile = redis.call('HGET', pg_keys[i], 'profile')
+        all_pgs[pg_keys[i]] = profile
+    else
+        table.insert(debuginfo, 'debug:pending remove entry skipped: ' .. pg_keys[i])
+    end
 end
 
 -- Fetch all the pending PGs, and store them into the hash table
diff --git a/tests/test_buffer_dynamic.py b/tests/test_buffer_dynamic.py
index 02a06569bd..49d36b357c 100644
--- a/tests/test_buffer_dynamic.py
+++ b/tests/test_buffer_dynamic.py
@@ -833,6 +833,19 @@ def test_bufferPortMaxParameter(self, dvs, testlog):
             profile_fvs['xoff'] = '9216'
             self.config_db.update_entry('BUFFER_PROFILE', 'test', profile_fvs)
             self.app_db.wait_for_field_match('BUFFER_PROFILE_TABLE', 'test', profile_fvs)
+
+            # Verify that a pending-remove PG is not counted into the accumulative headroom
+            dvs.runcmd("kill -s SIGSTOP {}".format(oa_pid))
+
+            self.config_db.delete_entry('BUFFER_PG', 'Ethernet0|3-4')
+            # Should be added because PG 3-4 has been removed and there is sufficient headroom
+            self.config_db.update_entry('BUFFER_PG', 'Ethernet0|1', {'profile': 'ingress_lossy_profile'})
+
+            # Resume orchagent
+            dvs.runcmd("kill -s SIGCONT {}".format(oa_pid))
+
+            # Check whether BUFFER_PG_TABLE is updated as expected
+            self.app_db.wait_for_field_match("BUFFER_PG_TABLE", "Ethernet0:1", {"profile": "ingress_lossy_profile"})
         finally:
             dvs.runcmd("kill -s SIGCONT {}".format(oa_pid))