From 5acd957a986c167d357e617a887630203be29ea4 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 1 Oct 2024 13:37:04 +0300 Subject: [PATCH 1/6] net/mlx5: hw counters: Make fc_stats & fc_pool private The mlx5_fc_stats and mlx5_fc_pool structs are only used from fs_counters.c. As such, make them private there. mlx5_fc_pool is not used or referenced at all outside fs_counters. mlx5_fc_stats is referenced from mlx5_core_dev, so instead of having it as a direct member (which requires exporting it from fs_counters), store a pointer to it, allocate it on init and clear it on destroy. One caveat is that a simple container_of to get from a 'work' struct to the outermost mlx5_core_dev struct directly no longer works, so an extra pointer had to be added to mlx5_fc_stats back to the parent dev. Signed-off-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241001103709.58127-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/fs_counters.c | 79 ++++++++++++++----- include/linux/mlx5/driver.h | 33 +------- 2 files changed, 60 insertions(+), 52 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index 0c26d707eed27b..7d6174d0f26046 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -69,6 +69,36 @@ struct mlx5_fc { struct mlx5_fc_cache cache ____cacheline_aligned_in_smp; }; +struct mlx5_fc_pool { + struct mlx5_core_dev *dev; + struct mutex pool_lock; /* protects pool lists */ + struct list_head fully_used; + struct list_head partially_used; + struct list_head unused; + int available_fcs; + int used_fcs; + int threshold; +}; + +struct mlx5_fc_stats { + spinlock_t counters_idr_lock; /* protects counters_idr */ + struct idr counters_idr; + struct list_head counters; + struct llist_head addlist; + struct llist_head dellist; + + struct workqueue_struct *wq; + struct delayed_work work; + unsigned long next_query; + unsigned long sampling_interval; /* jiffies */ + u32 *bulk_query_out; + int bulk_query_len; + size_t num_counters; + bool bulk_query_alloc_failed; + unsigned long next_bulk_query_alloc; + struct mlx5_fc_pool fc_pool; +}; + static void mlx5_fc_pool_init(struct mlx5_fc_pool *fc_pool, struct mlx5_core_dev *dev); static void mlx5_fc_pool_cleanup(struct mlx5_fc_pool *fc_pool); static struct mlx5_fc *mlx5_fc_pool_acquire_counter(struct mlx5_fc_pool *fc_pool); @@ -109,7 +139,7 @@ static void mlx5_fc_pool_release_counter(struct mlx5_fc_pool *fc_pool, struct ml static struct list_head *mlx5_fc_counters_lookup_next(struct mlx5_core_dev *dev, u32 id) { - struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; unsigned long next_id = (unsigned long)id + 1; struct mlx5_fc *counter; unsigned long tmp; @@ -137,7 +167,7 @@ static void mlx5_fc_stats_insert(struct mlx5_core_dev *dev, static void mlx5_fc_stats_remove(struct mlx5_core_dev *dev, struct mlx5_fc *counter) { - struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; list_del(&counter->list); @@ -178,7 +208,7 @@ static void mlx5_fc_stats_query_counter_range(struct mlx5_core_dev *dev, struct mlx5_fc *first, u32 last_id) { - struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; bool query_more_counters = (first->id <= last_id); int cur_bulk_len = fc_stats->bulk_query_len; u32 *data = fc_stats->bulk_query_out; @@ -225,7 +255,7 @@ static void mlx5_fc_free(struct mlx5_core_dev *dev, struct mlx5_fc *counter) static void mlx5_fc_release(struct mlx5_core_dev *dev, struct mlx5_fc *counter) { - struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; if (counter->bulk) mlx5_fc_pool_release_counter(&fc_stats->fc_pool, counter); @@ -235,7 +265,7 @@ static void mlx5_fc_release(struct mlx5_core_dev *dev, struct mlx5_fc *counter) static void mlx5_fc_stats_bulk_query_size_increase(struct mlx5_core_dev *dev) { - struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; int max_bulk_len = get_max_bulk_query_len(dev); unsigned long now = jiffies; u32 *bulk_query_out_tmp; @@ -270,9 +300,9 @@ static void mlx5_fc_stats_bulk_query_size_increase(struct mlx5_core_dev *dev) static void mlx5_fc_stats_work(struct work_struct *work) { - struct mlx5_core_dev *dev = container_of(work, struct mlx5_core_dev, - priv.fc_stats.work.work); - struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc_stats *fc_stats = container_of(work, struct mlx5_fc_stats, + work.work); + struct mlx5_core_dev *dev = fc_stats->fc_pool.dev; /* Take dellist first to ensure that counters cannot be deleted before * they are inserted. */ @@ -334,7 +364,7 @@ static struct mlx5_fc *mlx5_fc_single_alloc(struct mlx5_core_dev *dev) static struct mlx5_fc *mlx5_fc_acquire(struct mlx5_core_dev *dev, bool aging) { - struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; struct mlx5_fc *counter; if (aging && MLX5_CAP_GEN(dev, flow_counter_bulk_alloc) != 0) { @@ -349,7 +379,7 @@ static struct mlx5_fc *mlx5_fc_acquire(struct mlx5_core_dev *dev, bool aging) struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging) { struct mlx5_fc *counter = mlx5_fc_acquire(dev, aging); - struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; int err; if (IS_ERR(counter)) @@ -389,7 +419,7 @@ struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging) struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging) { struct mlx5_fc *counter = mlx5_fc_create_ex(dev, aging); - struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; if (aging) mod_delayed_work(fc_stats->wq, &fc_stats->work, 0); @@ -405,7 +435,7 @@ EXPORT_SYMBOL(mlx5_fc_id); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter) { - struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; if (!counter) return; @@ -422,10 +452,14 @@ EXPORT_SYMBOL(mlx5_fc_destroy); int mlx5_init_fc_stats(struct mlx5_core_dev *dev) { - struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc_stats *fc_stats; int init_bulk_len; int init_out_len; + fc_stats = kzalloc(sizeof(*fc_stats), GFP_KERNEL); + if (!fc_stats) + return -ENOMEM; + spin_lock_init(&fc_stats->counters_idr_lock); idr_init(&fc_stats->counters_idr); INIT_LIST_HEAD(&fc_stats->counters); @@ -436,7 +470,7 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev) init_out_len = mlx5_cmd_fc_get_bulk_query_out_len(init_bulk_len); fc_stats->bulk_query_out = kzalloc(init_out_len, GFP_KERNEL); if (!fc_stats->bulk_query_out) - return -ENOMEM; + goto err_bulk; fc_stats->bulk_query_len = init_bulk_len; fc_stats->wq = create_singlethread_workqueue("mlx5_fc"); @@ -447,23 +481,27 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev) INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work); mlx5_fc_pool_init(&fc_stats->fc_pool, dev); + dev->priv.fc_stats = fc_stats; + return 0; err_wq_create: kfree(fc_stats->bulk_query_out); +err_bulk: + kfree(fc_stats); return -ENOMEM; } void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev) { - struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; struct llist_node *tmplist; struct mlx5_fc *counter; struct mlx5_fc *tmp; - cancel_delayed_work_sync(&dev->priv.fc_stats.work); - destroy_workqueue(dev->priv.fc_stats.wq); - dev->priv.fc_stats.wq = NULL; + cancel_delayed_work_sync(&fc_stats->work); + destroy_workqueue(fc_stats->wq); + fc_stats->wq = NULL; tmplist = llist_del_all(&fc_stats->addlist); llist_for_each_entry_safe(counter, tmp, tmplist, addlist) @@ -475,6 +513,7 @@ void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev) mlx5_fc_pool_cleanup(&fc_stats->fc_pool); idr_destroy(&fc_stats->counters_idr); kfree(fc_stats->bulk_query_out); + kfree(fc_stats); } int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter, @@ -518,7 +557,7 @@ void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev, struct delayed_work *dwork, unsigned long delay) { - struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; queue_delayed_work(fc_stats->wq, dwork, delay); } @@ -526,7 +565,7 @@ void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev, void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev, unsigned long interval) { - struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; fc_stats->sampling_interval = min_t(unsigned long, interval, fc_stats->sampling_interval); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index e23c692a34c702..fc7e6153b73d9e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -45,7 +45,6 @@ #include #include #include -#include #include #include #include @@ -474,36 +473,6 @@ struct mlx5_core_sriov { u16 max_ec_vfs; }; -struct mlx5_fc_pool { - struct mlx5_core_dev *dev; - struct mutex pool_lock; /* protects pool lists */ - struct list_head fully_used; - struct list_head partially_used; - struct list_head unused; - int available_fcs; - int used_fcs; - int threshold; -}; - -struct mlx5_fc_stats { - spinlock_t counters_idr_lock; /* protects counters_idr */ - struct idr counters_idr; - struct list_head counters; - struct llist_head addlist; - struct llist_head dellist; - - struct workqueue_struct *wq; - struct delayed_work work; - unsigned long next_query; - unsigned long sampling_interval; /* jiffies */ - u32 *bulk_query_out; - int bulk_query_len; - size_t num_counters; - bool bulk_query_alloc_failed; - unsigned long next_bulk_query_alloc; - struct mlx5_fc_pool fc_pool; -}; - struct mlx5_events; struct mlx5_mpfs; struct mlx5_eswitch; @@ -630,7 +599,7 @@ struct mlx5_priv { struct mlx5_devcom_comp_dev *hca_devcom_comp; struct mlx5_fw_reset *fw_reset; struct mlx5_core_roce roce; - struct mlx5_fc_stats fc_stats; + struct mlx5_fc_stats *fc_stats; struct mlx5_rl_table rl_table; struct mlx5_ft_pool *ft_pool; From 10cd92df833c3f6c35dc5e923651146d41332538 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 1 Oct 2024 13:37:05 +0300 Subject: [PATCH 2/6] net/mlx5: hw counters: Use kvmalloc for bulk query buffer The bulk query buffer starts out small (see [1]) and as soon as the number of counters goes past the initial threshold grows to max size (32K entries, 512KB) with a retry scheme. This commit switches to using kvmalloc for the buffer, which has a near zero likelihood of failing, and thus the explicit retry scheme becomes superfluous and is taken out. On the low chance the allocation fails, it will still be retried every sampling_interval, when the wq task runs. [1] commit b247f32aecad ("net/mlx5: Dynamically resize flow counters query buffer") Signed-off-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241001103709.58127-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/fs_counters.c | 59 +++++++------------ 1 file changed, 22 insertions(+), 37 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index 7d6174d0f26046..9892895da9ee21 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -38,7 +38,6 @@ #include "fs_cmd.h" #define MLX5_FC_STATS_PERIOD msecs_to_jiffies(1000) -#define MLX5_FC_BULK_QUERY_ALLOC_PERIOD msecs_to_jiffies(180 * 1000) /* Max number of counters to query in bulk read is 32K */ #define MLX5_SW_MAX_COUNTERS_BULK BIT(15) #define MLX5_INIT_COUNTERS_BULK 8 @@ -263,39 +262,28 @@ static void mlx5_fc_release(struct mlx5_core_dev *dev, struct mlx5_fc *counter) mlx5_fc_free(dev, counter); } -static void mlx5_fc_stats_bulk_query_size_increase(struct mlx5_core_dev *dev) +static void mlx5_fc_stats_bulk_query_buf_realloc(struct mlx5_core_dev *dev, + int bulk_query_len) { struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; - int max_bulk_len = get_max_bulk_query_len(dev); - unsigned long now = jiffies; u32 *bulk_query_out_tmp; - int max_out_len; - - if (fc_stats->bulk_query_alloc_failed && - time_before(now, fc_stats->next_bulk_query_alloc)) - return; + int out_len; - max_out_len = mlx5_cmd_fc_get_bulk_query_out_len(max_bulk_len); - bulk_query_out_tmp = kzalloc(max_out_len, GFP_KERNEL); + out_len = mlx5_cmd_fc_get_bulk_query_out_len(bulk_query_len); + bulk_query_out_tmp = kvzalloc(out_len, GFP_KERNEL); if (!bulk_query_out_tmp) { mlx5_core_warn_once(dev, - "Can't increase flow counters bulk query buffer size, insufficient memory, bulk_size(%d)\n", - max_bulk_len); - fc_stats->bulk_query_alloc_failed = true; - fc_stats->next_bulk_query_alloc = - now + MLX5_FC_BULK_QUERY_ALLOC_PERIOD; + "Can't increase flow counters bulk query buffer size, alloc failed, bulk_query_len(%d)\n", + bulk_query_len); return; } - kfree(fc_stats->bulk_query_out); + kvfree(fc_stats->bulk_query_out); fc_stats->bulk_query_out = bulk_query_out_tmp; - fc_stats->bulk_query_len = max_bulk_len; - if (fc_stats->bulk_query_alloc_failed) { - mlx5_core_info(dev, - "Flow counters bulk query buffer size increased, bulk_size(%d)\n", - max_bulk_len); - fc_stats->bulk_query_alloc_failed = false; - } + fc_stats->bulk_query_len = bulk_query_len; + mlx5_core_info(dev, + "Flow counters bulk query buffer size increased, bulk_query_len(%d)\n", + bulk_query_len); } static void mlx5_fc_stats_work(struct work_struct *work) @@ -327,13 +315,14 @@ static void mlx5_fc_stats_work(struct work_struct *work) fc_stats->num_counters--; } - if (fc_stats->bulk_query_len < get_max_bulk_query_len(dev) && - fc_stats->num_counters > get_init_bulk_query_len(dev)) - mlx5_fc_stats_bulk_query_size_increase(dev); - if (time_before(now, fc_stats->next_query) || list_empty(&fc_stats->counters)) return; + + if (fc_stats->bulk_query_len < get_max_bulk_query_len(dev) && + fc_stats->num_counters > get_init_bulk_query_len(dev)) + mlx5_fc_stats_bulk_query_buf_realloc(dev, get_max_bulk_query_len(dev)); + last = list_last_entry(&fc_stats->counters, struct mlx5_fc, list); counter = list_first_entry(&fc_stats->counters, struct mlx5_fc, @@ -453,12 +442,11 @@ EXPORT_SYMBOL(mlx5_fc_destroy); int mlx5_init_fc_stats(struct mlx5_core_dev *dev) { struct mlx5_fc_stats *fc_stats; - int init_bulk_len; - int init_out_len; fc_stats = kzalloc(sizeof(*fc_stats), GFP_KERNEL); if (!fc_stats) return -ENOMEM; + dev->priv.fc_stats = fc_stats; spin_lock_init(&fc_stats->counters_idr_lock); idr_init(&fc_stats->counters_idr); @@ -466,12 +454,10 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev) init_llist_head(&fc_stats->addlist); init_llist_head(&fc_stats->dellist); - init_bulk_len = get_init_bulk_query_len(dev); - init_out_len = mlx5_cmd_fc_get_bulk_query_out_len(init_bulk_len); - fc_stats->bulk_query_out = kzalloc(init_out_len, GFP_KERNEL); + /* Allocate initial (small) bulk query buffer. */ + mlx5_fc_stats_bulk_query_buf_realloc(dev, get_init_bulk_query_len(dev)); if (!fc_stats->bulk_query_out) goto err_bulk; - fc_stats->bulk_query_len = init_bulk_len; fc_stats->wq = create_singlethread_workqueue("mlx5_fc"); if (!fc_stats->wq) @@ -481,12 +467,11 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev) INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work); mlx5_fc_pool_init(&fc_stats->fc_pool, dev); - dev->priv.fc_stats = fc_stats; return 0; err_wq_create: - kfree(fc_stats->bulk_query_out); + kvfree(fc_stats->bulk_query_out); err_bulk: kfree(fc_stats); return -ENOMEM; @@ -512,7 +497,7 @@ void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev) mlx5_fc_pool_cleanup(&fc_stats->fc_pool); idr_destroy(&fc_stats->counters_idr); - kfree(fc_stats->bulk_query_out); + kvfree(fc_stats->bulk_query_out); kfree(fc_stats); } From 918af0219a4d6a89cf02839005ede24e91f13bf6 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 1 Oct 2024 13:37:06 +0300 Subject: [PATCH 3/6] net/mlx5: hw counters: Replace IDR+lists with xarray Previously, managing counters was a complicated affair involving an IDR, a sorted double linked list, two single linked lists and a complex dance between a non-periodic wq task and users adding/deleting counters. Adding was done by inserting new counters into the IDR and into a single linked list, leaving the wq to process the list and actually add the counters into the double linked list, maintained sorted with the IDR. Deleting involved adding the counter into another single linked list, leaving the wq to actually unlink the counter from the other structures and release it. Dumping the counters is done with the bulk query API, which relies on the counter list being sorted and unmutable during querying to efficiently retrieve cached counter values. Finally, the IDR data struct is deprecated. This commit replaces all of that with an xarray. Adding is now done directly, by using xa_lock. Deleting is also done directly, under the xa_lock. Querying is done from a periodic task running every sampling_interval (default 1s) and uses the bulk query API for efficiency. It works by iterating over the xarray: - when a new bulk needs to be started, the bulk information is computed under the xa_lock. - the xa iteration state is saved and the xa_lock dropped. - the HW is queried for bulk counter values. - the xa_lock is reacquired. - counter caches with ids covered by the bulk response are updated. Querying always requests the max bulk length, for simplicity. Counters could be added/deleted while the HW is queried. This is safe, as the HW API simply returns unknown values for counters not in HW, but those values won't be accessed. Only counters present in xarray before bulk query will actually read queried cache values. This cuts down the size of mlx5_fc by 4 pointers (88->56 bytes), which amounts to ~3MB / 100K counters. But more importantly, this solves the wq spinlock congestion issue seen happening on high-rate counter insertion+deletion. Signed-off-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241001103709.58127-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/fs_counters.c | 276 ++++++------------ 1 file changed, 82 insertions(+), 194 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index 9892895da9ee21..05d9351ff57716 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -32,7 +32,6 @@ #include #include -#include #include "mlx5_core.h" #include "fs_core.h" #include "fs_cmd.h" @@ -51,21 +50,13 @@ struct mlx5_fc_cache { }; struct mlx5_fc { - struct list_head list; - struct llist_node addlist; - struct llist_node dellist; - - /* last{packets,bytes} members are used when calculating the delta since - * last reading - */ - u64 lastpackets; - u64 lastbytes; - - struct mlx5_fc_bulk *bulk; u32 id; bool aging; - + struct mlx5_fc_bulk *bulk; struct mlx5_fc_cache cache ____cacheline_aligned_in_smp; + /* last{packets,bytes} are used for calculating deltas since last reading. */ + u64 lastpackets; + u64 lastbytes; }; struct mlx5_fc_pool { @@ -80,19 +71,14 @@ struct mlx5_fc_pool { }; struct mlx5_fc_stats { - spinlock_t counters_idr_lock; /* protects counters_idr */ - struct idr counters_idr; - struct list_head counters; - struct llist_head addlist; - struct llist_head dellist; + struct xarray counters; struct workqueue_struct *wq; struct delayed_work work; - unsigned long next_query; unsigned long sampling_interval; /* jiffies */ u32 *bulk_query_out; int bulk_query_len; - size_t num_counters; + size_t num_counters; /* Also protected by xarray->xa_lock. */ bool bulk_query_alloc_failed; unsigned long next_bulk_query_alloc; struct mlx5_fc_pool fc_pool; @@ -103,78 +89,6 @@ static void mlx5_fc_pool_cleanup(struct mlx5_fc_pool *fc_pool); static struct mlx5_fc *mlx5_fc_pool_acquire_counter(struct mlx5_fc_pool *fc_pool); static void mlx5_fc_pool_release_counter(struct mlx5_fc_pool *fc_pool, struct mlx5_fc *fc); -/* locking scheme: - * - * It is the responsibility of the user to prevent concurrent calls or bad - * ordering to mlx5_fc_create(), mlx5_fc_destroy() and accessing a reference - * to struct mlx5_fc. - * e.g en_tc.c is protected by RTNL lock of its caller, and will never call a - * dump (access to struct mlx5_fc) after a counter is destroyed. - * - * access to counter list: - * - create (user context) - * - mlx5_fc_create() only adds to an addlist to be used by - * mlx5_fc_stats_work(). addlist is a lockless single linked list - * that doesn't require any additional synchronization when adding single - * node. - * - spawn thread to do the actual destroy - * - * - destroy (user context) - * - add a counter to lockless dellist - * - spawn thread to do the actual del - * - * - dump (user context) - * user should not call dump after destroy - * - * - query (single thread workqueue context) - * destroy/dump - no conflict (see destroy) - * query/dump - packets and bytes might be inconsistent (since update is not - * atomic) - * query/create - no conflict (see create) - * since every create/destroy spawn the work, only after necessary time has - * elapsed, the thread will actually query the hardware. - */ - -static struct list_head *mlx5_fc_counters_lookup_next(struct mlx5_core_dev *dev, - u32 id) -{ - struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; - unsigned long next_id = (unsigned long)id + 1; - struct mlx5_fc *counter; - unsigned long tmp; - - rcu_read_lock(); - /* skip counters that are in idr, but not yet in counters list */ - idr_for_each_entry_continue_ul(&fc_stats->counters_idr, - counter, tmp, next_id) { - if (!list_empty(&counter->list)) - break; - } - rcu_read_unlock(); - - return counter ? &counter->list : &fc_stats->counters; -} - -static void mlx5_fc_stats_insert(struct mlx5_core_dev *dev, - struct mlx5_fc *counter) -{ - struct list_head *next = mlx5_fc_counters_lookup_next(dev, counter->id); - - list_add_tail(&counter->list, next); -} - -static void mlx5_fc_stats_remove(struct mlx5_core_dev *dev, - struct mlx5_fc *counter) -{ - struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; - - list_del(&counter->list); - - spin_lock(&fc_stats->counters_idr_lock); - WARN_ON(!idr_remove(&fc_stats->counters_idr, counter->id)); - spin_unlock(&fc_stats->counters_idr_lock); -} - static int get_init_bulk_query_len(struct mlx5_core_dev *dev) { return min_t(int, MLX5_INIT_COUNTERS_BULK, @@ -203,47 +117,64 @@ static void update_counter_cache(int index, u32 *bulk_raw_data, cache->lastuse = jiffies; } -static void mlx5_fc_stats_query_counter_range(struct mlx5_core_dev *dev, - struct mlx5_fc *first, - u32 last_id) +/* Synchronization notes + * + * Access to counter array: + * - create - mlx5_fc_create() (user context) + * - inserts the counter into the xarray. + * + * - destroy - mlx5_fc_destroy() (user context) + * - erases the counter from the xarray and releases it. + * + * - query mlx5_fc_query(), mlx5_fc_query_cached{,_raw}() (user context) + * - user should not access a counter after destroy. + * + * - bulk query (single thread workqueue context) + * - create: query relies on 'lastuse' to avoid updating counters added + * around the same time as the current bulk cmd. + * - destroy: destroyed counters will not be accessed, even if they are + * destroyed during a bulk query command. + */ +static void mlx5_fc_stats_query_all_counters(struct mlx5_core_dev *dev) { struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; - bool query_more_counters = (first->id <= last_id); - int cur_bulk_len = fc_stats->bulk_query_len; + u32 bulk_len = fc_stats->bulk_query_len; + XA_STATE(xas, &fc_stats->counters, 0); u32 *data = fc_stats->bulk_query_out; - struct mlx5_fc *counter = first; + struct mlx5_fc *counter; + u32 last_bulk_id = 0; + u64 bulk_query_time; u32 bulk_base_id; - int bulk_len; int err; - while (query_more_counters) { - /* first id must be aligned to 4 when using bulk query */ - bulk_base_id = counter->id & ~0x3; - - /* number of counters to query inc. the last counter */ - bulk_len = min_t(int, cur_bulk_len, - ALIGN(last_id - bulk_base_id + 1, 4)); - - err = mlx5_cmd_fc_bulk_query(dev, bulk_base_id, bulk_len, - data); - if (err) { - mlx5_core_err(dev, "Error doing bulk query: %d\n", err); - return; - } - query_more_counters = false; - - list_for_each_entry_from(counter, &fc_stats->counters, list) { - int counter_index = counter->id - bulk_base_id; - struct mlx5_fc_cache *cache = &counter->cache; - - if (counter->id >= bulk_base_id + bulk_len) { - query_more_counters = true; - break; + xas_lock(&xas); + xas_for_each(&xas, counter, U32_MAX) { + if (xas_retry(&xas, counter)) + continue; + if (unlikely(counter->id >= last_bulk_id)) { + /* Start new bulk query. */ + /* First id must be aligned to 4 when using bulk query. */ + bulk_base_id = counter->id & ~0x3; + last_bulk_id = bulk_base_id + bulk_len; + /* The lock is released while querying the hw and reacquired after. */ + xas_unlock(&xas); + /* The same id needs to be processed again in the next loop iteration. */ + xas_reset(&xas); + bulk_query_time = jiffies; + err = mlx5_cmd_fc_bulk_query(dev, bulk_base_id, bulk_len, data); + if (err) { + mlx5_core_err(dev, "Error doing bulk query: %d\n", err); + return; } - - update_counter_cache(counter_index, data, cache); + xas_lock(&xas); + continue; } + /* Do not update counters added after bulk query was started. */ + if (time_after64(bulk_query_time, counter->cache.lastuse)) + update_counter_cache(counter->id - bulk_base_id, data, + &counter->cache); } + xas_unlock(&xas); } static void mlx5_fc_free(struct mlx5_core_dev *dev, struct mlx5_fc *counter) @@ -291,46 +222,19 @@ static void mlx5_fc_stats_work(struct work_struct *work) struct mlx5_fc_stats *fc_stats = container_of(work, struct mlx5_fc_stats, work.work); struct mlx5_core_dev *dev = fc_stats->fc_pool.dev; - /* Take dellist first to ensure that counters cannot be deleted before - * they are inserted. - */ - struct llist_node *dellist = llist_del_all(&fc_stats->dellist); - struct llist_node *addlist = llist_del_all(&fc_stats->addlist); - struct mlx5_fc *counter = NULL, *last = NULL, *tmp; - unsigned long now = jiffies; - - if (addlist || !list_empty(&fc_stats->counters)) - queue_delayed_work(fc_stats->wq, &fc_stats->work, - fc_stats->sampling_interval); - - llist_for_each_entry(counter, addlist, addlist) { - mlx5_fc_stats_insert(dev, counter); - fc_stats->num_counters++; - } + int num_counters; - llist_for_each_entry_safe(counter, tmp, dellist, dellist) { - mlx5_fc_stats_remove(dev, counter); - - mlx5_fc_release(dev, counter); - fc_stats->num_counters--; - } - - if (time_before(now, fc_stats->next_query) || - list_empty(&fc_stats->counters)) - return; + queue_delayed_work(fc_stats->wq, &fc_stats->work, fc_stats->sampling_interval); + /* num_counters is only needed for determining whether to increase the buffer. */ + xa_lock(&fc_stats->counters); + num_counters = fc_stats->num_counters; + xa_unlock(&fc_stats->counters); if (fc_stats->bulk_query_len < get_max_bulk_query_len(dev) && - fc_stats->num_counters > get_init_bulk_query_len(dev)) + num_counters > get_init_bulk_query_len(dev)) mlx5_fc_stats_bulk_query_buf_realloc(dev, get_max_bulk_query_len(dev)); - last = list_last_entry(&fc_stats->counters, struct mlx5_fc, list); - - counter = list_first_entry(&fc_stats->counters, struct mlx5_fc, - list); - if (counter) - mlx5_fc_stats_query_counter_range(dev, counter, last->id); - - fc_stats->next_query = now + fc_stats->sampling_interval; + mlx5_fc_stats_query_all_counters(dev); } static struct mlx5_fc *mlx5_fc_single_alloc(struct mlx5_core_dev *dev) @@ -374,7 +278,6 @@ struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging) if (IS_ERR(counter)) return counter; - INIT_LIST_HEAD(&counter->list); counter->aging = aging; if (aging) { @@ -384,18 +287,15 @@ struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging) counter->lastbytes = counter->cache.bytes; counter->lastpackets = counter->cache.packets; - idr_preload(GFP_KERNEL); - spin_lock(&fc_stats->counters_idr_lock); + xa_lock(&fc_stats->counters); - err = idr_alloc_u32(&fc_stats->counters_idr, counter, &id, id, - GFP_NOWAIT); - - spin_unlock(&fc_stats->counters_idr_lock); - idr_preload_end(); - if (err) + err = xa_err(__xa_store(&fc_stats->counters, id, counter, GFP_KERNEL)); + if (err != 0) { + xa_unlock(&fc_stats->counters); goto err_out_alloc; - - llist_add(&counter->addlist, &fc_stats->addlist); + } + fc_stats->num_counters++; + xa_unlock(&fc_stats->counters); } return counter; @@ -407,12 +307,7 @@ struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging) struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging) { - struct mlx5_fc *counter = mlx5_fc_create_ex(dev, aging); - struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; - - if (aging) - mod_delayed_work(fc_stats->wq, &fc_stats->work, 0); - return counter; + return mlx5_fc_create_ex(dev, aging); } EXPORT_SYMBOL(mlx5_fc_create); @@ -430,11 +325,11 @@ void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter) return; if (counter->aging) { - llist_add(&counter->dellist, &fc_stats->dellist); - mod_delayed_work(fc_stats->wq, &fc_stats->work, 0); - return; + xa_lock(&fc_stats->counters); + fc_stats->num_counters--; + __xa_erase(&fc_stats->counters, counter->id); + xa_unlock(&fc_stats->counters); } - mlx5_fc_release(dev, counter); } EXPORT_SYMBOL(mlx5_fc_destroy); @@ -448,11 +343,7 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev) return -ENOMEM; dev->priv.fc_stats = fc_stats; - spin_lock_init(&fc_stats->counters_idr_lock); - idr_init(&fc_stats->counters_idr); - INIT_LIST_HEAD(&fc_stats->counters); - init_llist_head(&fc_stats->addlist); - init_llist_head(&fc_stats->dellist); + xa_init(&fc_stats->counters); /* Allocate initial (small) bulk query buffer. */ mlx5_fc_stats_bulk_query_buf_realloc(dev, get_init_bulk_query_len(dev)); @@ -467,7 +358,7 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev) INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work); mlx5_fc_pool_init(&fc_stats->fc_pool, dev); - + queue_delayed_work(fc_stats->wq, &fc_stats->work, MLX5_FC_STATS_PERIOD); return 0; err_wq_create: @@ -480,23 +371,20 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev) void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev) { struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; - struct llist_node *tmplist; struct mlx5_fc *counter; - struct mlx5_fc *tmp; + unsigned long id; cancel_delayed_work_sync(&fc_stats->work); destroy_workqueue(fc_stats->wq); fc_stats->wq = NULL; - tmplist = llist_del_all(&fc_stats->addlist); - llist_for_each_entry_safe(counter, tmp, tmplist, addlist) - mlx5_fc_release(dev, counter); - - list_for_each_entry_safe(counter, tmp, &fc_stats->counters, list) + xa_for_each(&fc_stats->counters, id, counter) { + xa_erase(&fc_stats->counters, id); mlx5_fc_release(dev, counter); + } + xa_destroy(&fc_stats->counters); mlx5_fc_pool_cleanup(&fc_stats->fc_pool); - idr_destroy(&fc_stats->counters_idr); kvfree(fc_stats->bulk_query_out); kfree(fc_stats); } From d95f77f1196a9458f61a08faa0eb569cf6f03a84 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 1 Oct 2024 13:37:07 +0300 Subject: [PATCH 4/6] net/mlx5: hw counters: Drop unneeded cacheline alignment The mlx5_fc struct has a cache for values queried from hw, which is cacheline aligned. On x86_64, this results in: struct mlx5_fc { u32 id; /* 0 4 */ bool aging; /* 4 1 */ /* XXX 3 bytes hole, try to pack */ struct mlx5_fc_bulk * bulk; /* 8 8 */ /* XXX 48 bytes hole, try to pack */ /* --- cacheline 1 boundary (64 bytes) --- */ struct mlx5_fc_cache cache __attribute__((__aligned__(64))); /* 64 24 */ u64 lastpackets; /* 88 8 */ u64 lastbytes; /* 96 8 */ /* size: 128, cachelines: 2, members: 6 */ /* sum members: 53, holes: 2, sum holes: 51 */ /* padding: 24 */ /* forced aligns: 1, forced holes: 1, sum forced holes: 48 */ } __attribute__((__aligned__(64))); (output from pahole). ...So a 48+24=72 byte waste. As far as I can determine, this serves no purpose other than maybe making sure that the values in the cache do not span two cachelines in the worst case scenario, but that's not a valid enough reason to waste 72 bytes per counter, especially since this code is not performance-critical. There could potentially be hundreds of thousands of counters (e.g. for connection-tracking), so this quickly adds up to multiple MB wasted. This commit removes the alignment, resulting in: struct mlx5_fc { [...] /* size: 56, cachelines: 1, members: 6 */ /* sum members: 53, holes: 1, sum holes: 3 */ /* last cacheline: 56 bytes */ }; Signed-off-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241001103709.58127-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index 05d9351ff57716..ef13941e55c20b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -53,7 +53,7 @@ struct mlx5_fc { u32 id; bool aging; struct mlx5_fc_bulk *bulk; - struct mlx5_fc_cache cache ____cacheline_aligned_in_smp; + struct mlx5_fc_cache cache; /* last{packets,bytes} are used for calculating deltas since last reading. */ u64 lastpackets; u64 lastbytes; From 4a67ebf85f384853970c56099bfc3096f66c20ed Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 1 Oct 2024 13:37:08 +0300 Subject: [PATCH 5/6] net/mlx5: hw counters: Don't maintain a counter count num_counters is only used for deciding whether to grow the bulk query buffer, which is done once more counters than a small initial threshold are present. After that, maintaining num_counters serves no purpose. This commit replaces that with an actual xarray traversal to count the counters. This appears expensive at first sight, but is only done when the number of counters is less than the initial threshold (8) and only once every sampling interval. Once the number of counters goes above the threshold, the bulk query buffer is grown to max size and the xarray traversal is never done again. Signed-off-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241001103709.58127-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/fs_counters.c | 40 +++++++++---------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index ef13941e55c20b..0b80c33cba5fd3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -78,7 +78,6 @@ struct mlx5_fc_stats { unsigned long sampling_interval; /* jiffies */ u32 *bulk_query_out; int bulk_query_len; - size_t num_counters; /* Also protected by xarray->xa_lock. */ bool bulk_query_alloc_failed; unsigned long next_bulk_query_alloc; struct mlx5_fc_pool fc_pool; @@ -217,21 +216,28 @@ static void mlx5_fc_stats_bulk_query_buf_realloc(struct mlx5_core_dev *dev, bulk_query_len); } +static int mlx5_fc_num_counters(struct mlx5_fc_stats *fc_stats) +{ + struct mlx5_fc *counter; + int num_counters = 0; + unsigned long id; + + xa_for_each(&fc_stats->counters, id, counter) + num_counters++; + return num_counters; +} + static void mlx5_fc_stats_work(struct work_struct *work) { struct mlx5_fc_stats *fc_stats = container_of(work, struct mlx5_fc_stats, work.work); struct mlx5_core_dev *dev = fc_stats->fc_pool.dev; - int num_counters; queue_delayed_work(fc_stats->wq, &fc_stats->work, fc_stats->sampling_interval); - /* num_counters is only needed for determining whether to increase the buffer. */ - xa_lock(&fc_stats->counters); - num_counters = fc_stats->num_counters; - xa_unlock(&fc_stats->counters); - if (fc_stats->bulk_query_len < get_max_bulk_query_len(dev) && - num_counters > get_init_bulk_query_len(dev)) + /* Grow the bulk query buffer to max if not maxed and enough counters are present. */ + if (unlikely(fc_stats->bulk_query_len < get_max_bulk_query_len(dev) && + mlx5_fc_num_counters(fc_stats) > get_init_bulk_query_len(dev))) mlx5_fc_stats_bulk_query_buf_realloc(dev, get_max_bulk_query_len(dev)); mlx5_fc_stats_query_all_counters(dev); @@ -287,15 +293,9 @@ struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging) counter->lastbytes = counter->cache.bytes; counter->lastpackets = counter->cache.packets; - xa_lock(&fc_stats->counters); - - err = xa_err(__xa_store(&fc_stats->counters, id, counter, GFP_KERNEL)); - if (err != 0) { - xa_unlock(&fc_stats->counters); + err = xa_err(xa_store(&fc_stats->counters, id, counter, GFP_KERNEL)); + if (err != 0) goto err_out_alloc; - } - fc_stats->num_counters++; - xa_unlock(&fc_stats->counters); } return counter; @@ -324,12 +324,8 @@ void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter) if (!counter) return; - if (counter->aging) { - xa_lock(&fc_stats->counters); - fc_stats->num_counters--; - __xa_erase(&fc_stats->counters, counter->id); - xa_unlock(&fc_stats->counters); - } + if (counter->aging) + xa_erase(&fc_stats->counters, counter->id); mlx5_fc_release(dev, counter); } EXPORT_SYMBOL(mlx5_fc_destroy); From d1c9cffe4b01f4d8bc52169139a5fedd48908abc Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 1 Oct 2024 13:37:09 +0300 Subject: [PATCH 6/6] net/mlx5: hw counters: Remove mlx5_fc_create_ex It no longer serves any purpose and is identical to mlx5_fc_create upon which it was originally based of. Signed-off-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241001103709.58127-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c | 7 +------ include/linux/mlx5/fs.h | 3 --- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index dcfccaaa8d917b..4877a9d86807ac 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -1026,7 +1026,7 @@ mlx5_tc_ct_counter_create(struct mlx5_tc_ct_priv *ct_priv) return ERR_PTR(-ENOMEM); counter->is_shared = false; - counter->counter = mlx5_fc_create_ex(ct_priv->dev, true); + counter->counter = mlx5_fc_create(ct_priv->dev, true); if (IS_ERR(counter->counter)) { ct_dbg("Failed to create counter for ct entry"); ret = PTR_ERR(counter->counter); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index 0b80c33cba5fd3..62d0c689796b97 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -275,7 +275,7 @@ static struct mlx5_fc *mlx5_fc_acquire(struct mlx5_core_dev *dev, bool aging) return mlx5_fc_single_alloc(dev); } -struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging) +struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging) { struct mlx5_fc *counter = mlx5_fc_acquire(dev, aging); struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; @@ -304,11 +304,6 @@ struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging) mlx5_fc_release(dev, counter); return ERR_PTR(err); } - -struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging) -{ - return mlx5_fc_create_ex(dev, aging); -} EXPORT_SYMBOL(mlx5_fc_create); u32 mlx5_fc_id(struct mlx5_fc *counter) diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index b744e554f014d1..438db888bde0d6 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -298,9 +298,6 @@ int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler, struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); -/* As mlx5_fc_create() but doesn't queue stats refresh thread. */ -struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging); - void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter,