Skip to content

Commit

Permalink
net/mlx5: Implement thermal zone
Browse files Browse the repository at this point in the history
Implement thermal zone support for mlx5-based HW. The NIC
uses the temperature sensor provided by the ASIC to report the current
temperature to the thermal core.

Signed-off-by: Sandipan Patra <[email protected]>
Reviewed-by: Gal Pressman <[email protected]>
Signed-off-by: Saeed Mahameed <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
  • Loading branch information
Sandipan Patra authored and kuba-moo committed Mar 16, 2023
1 parent ceefcfb commit c1fef61
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 0 deletions.
1 change: 1 addition & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH) += esw/acl/helper.o \

mlx5_core-$(CONFIG_MLX5_BRIDGE) += esw/bridge.o en/rep/bridge.o

mlx5_core-$(CONFIG_THERMAL) += thermal.o
mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o
mlx5_core-$(CONFIG_VXLAN) += lib/vxlan.o
mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
Expand Down
6 changes: 6 additions & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
#include <linux/version.h>
#include <net/devlink.h>
#include "mlx5_core.h"
#include "thermal.h"
#include "lib/eq.h"
#include "fs_core.h"
#include "lib/mpfs.h"
Expand Down Expand Up @@ -1768,6 +1769,10 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
if (err)
dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err);

err = mlx5_thermal_init(dev);
if (err)
dev_err(&pdev->dev, "mlx5_thermal_init failed with error code %d\n", err);

pci_save_state(pdev);
devlink_register(devlink);
return 0;
Expand Down Expand Up @@ -1796,6 +1801,7 @@ static void remove_one(struct pci_dev *pdev)
set_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state);
devlink_unregister(devlink);
mlx5_sriov_disable(pdev);
mlx5_thermal_uninit(dev);
mlx5_crdump_disable(dev);
mlx5_drain_health_wq(dev);
mlx5_uninit_one(dev);
Expand Down
108 changes: 108 additions & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/thermal.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/thermal.h>
#include <linux/err.h>
#include <linux/mlx5/driver.h>
#include "mlx5_core.h"
#include "thermal.h"

#define MLX5_THERMAL_POLL_INT_MSEC 1000
#define MLX5_THERMAL_NUM_TRIPS 0
#define MLX5_THERMAL_ASIC_SENSOR_INDEX 0

/* Bit string indicating the writability of trip points, if any */
#define MLX5_THERMAL_TRIP_MASK (BIT(MLX5_THERMAL_NUM_TRIPS) - 1)

/* Per-device thermal state. Allocated by mlx5_thermal_init(), stored in
 * mdev->thermal, and passed to the thermal core as the zone's devdata.
 */
struct mlx5_thermal {
/* Owning mlx5 device; used by the get_temp callback to reach the HW. */
struct mlx5_core_dev *mdev;
/* Zone handle returned by thermal_zone_device_register(). */
struct thermal_zone_device *tzdev;
};

/* Read the MTMP register for sensor @id and store the raw value of its
 * "temperature" field in @p_temp.
 *
 * Returns 0 on success. On failure the error from mlx5_core_access_reg()
 * is returned and @p_temp is left untouched.
 */
static int mlx5_thermal_get_mtmp_temp(struct mlx5_core_dev *mdev, u32 id, int *p_temp)
{
	u32 in[MLX5_ST_SZ_DW(mtmp_reg)] = {};
	u32 out[MLX5_ST_SZ_DW(mtmp_reg)] = {};
	int ret;

	/* Select which sensor the register query refers to. */
	MLX5_SET(mtmp_reg, in, sensor_index, id);

	ret = mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out),
				   MLX5_REG_MTMP, 0, 0);
	if (!ret)
		*p_temp = MLX5_GET(mtmp_reg, out, temperature);

	return ret;
}

/* Thermal core .get_temp callback: report the ASIC sensor temperature.
 *
 * On success stores the temperature in millidegrees Celsius in @p_temp and
 * returns 0. On failure returns the register-access error and does not
 * modify @p_temp.
 */
static int mlx5_thermal_get_temp(struct thermal_zone_device *tzdev,
				 int *p_temp)
{
	struct mlx5_thermal *thermal = tzdev->devdata;
	int raw;
	int ret;

	ret = mlx5_thermal_get_mtmp_temp(thermal->mdev,
					 MLX5_THERMAL_ASIC_SENSOR_INDEX, &raw);
	if (ret)
		return ret;

	/* The device reports in units of 0.125 C while the thermal
	 * framework works in 0.001 C, hence the factor of 125.
	 */
	*p_temp = raw * 125;

	return 0;
}

/* Callbacks handed to thermal_zone_device_register(). Only temperature
 * readout is implemented; no other callbacks are set.
 */
static struct thermal_zone_device_ops mlx5_thermal_ops = {
.get_temp = mlx5_thermal_get_temp,
};

/* Register a thermal zone named "mlx5" backed by the device's ASIC sensor.
 *
 * If a zone with that name is already registered, nothing is done and 0 is
 * returned; mdev->thermal is not modified in that case. Otherwise a
 * struct mlx5_thermal is allocated, registered with the thermal core (no
 * trip points, polled every MLX5_THERMAL_POLL_INT_MSEC ms) and stored in
 * mdev->thermal.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the error code
 * reported by thermal_zone_device_register() if registration fails.
 */
int mlx5_thermal_init(struct mlx5_core_dev *mdev)
{
	struct mlx5_thermal *thermal;
	struct thermal_zone_device *tzd;
	const char *data = "mlx5";
	int err;

	/* Only one zone with this name is registered; skip if it exists. */
	tzd = thermal_zone_get_zone_by_name(data);
	if (!IS_ERR(tzd))
		return 0;

	thermal = kzalloc(sizeof(*thermal), GFP_KERNEL);
	if (!thermal)
		return -ENOMEM;

	thermal->mdev = mdev;
	thermal->tzdev = thermal_zone_device_register(data,
						      MLX5_THERMAL_NUM_TRIPS,
						      MLX5_THERMAL_TRIP_MASK,
						      thermal,
						      &mlx5_thermal_ops,
						      NULL, 0, MLX5_THERMAL_POLL_INT_MSEC);
	if (IS_ERR(thermal->tzdev)) {
		/* Capture the real error before freeing the container so
		 * the caller sees why registration failed instead of a
		 * generic -EINVAL.
		 */
		err = PTR_ERR(thermal->tzdev);
		dev_err(mdev->device, "Failed to register thermal zone device (%s) %ld\n",
			data, PTR_ERR(thermal->tzdev));
		kfree(thermal);
		return err;
	}

	mdev->thermal = thermal;
	return 0;
}

void mlx5_thermal_uninit(struct mlx5_core_dev *mdev)
{
if (!mdev->thermal)
return;

thermal_zone_device_unregister(mdev->thermal->tzdev);
kfree(mdev->thermal);
}
20 changes: 20 additions & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/thermal.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.
*/
#ifndef __MLX5_THERMAL_DRIVER_H
#define __MLX5_THERMAL_DRIVER_H

/* With CONFIG_THERMAL enabled the real implementations are declared here;
 * otherwise the inline stubs below are used so callers need no #ifdefs.
 */
#if IS_ENABLED(CONFIG_THERMAL)
int mlx5_thermal_init(struct mlx5_core_dev *mdev);
void mlx5_thermal_uninit(struct mlx5_core_dev *mdev);
#else
/* Stub: no thermal zone is created; mdev->thermal is cleared and success
 * is reported.
 */
static inline int mlx5_thermal_init(struct mlx5_core_dev *mdev)
{
mdev->thermal = NULL;
return 0;
}

/* Stub: nothing to tear down. */
static inline void mlx5_thermal_uninit(struct mlx5_core_dev *mdev) { }
#endif

#endif /* __MLX5_THERMAL_DRIVER_H */
3 changes: 3 additions & 0 deletions include/linux/mlx5/driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ enum {
MLX5_REG_PCAM = 0x507f,
MLX5_REG_NODE_DESC = 0x6001,
MLX5_REG_HOST_ENDIANNESS = 0x7004,
MLX5_REG_MTMP = 0x900A,
MLX5_REG_MCIA = 0x9014,
MLX5_REG_MFRL = 0x9028,
MLX5_REG_MLCR = 0x902b,
Expand Down Expand Up @@ -731,6 +732,7 @@ struct mlx5_fw_tracer;
struct mlx5_vxlan;
struct mlx5_geneve;
struct mlx5_hv_vhca;
struct mlx5_thermal;

#define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev) (MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity))
#define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev))
Expand Down Expand Up @@ -808,6 +810,7 @@ struct mlx5_core_dev {
struct mlx5_rsc_dump *rsc_dump;
u32 vsc_addr;
struct mlx5_hv_vhca *hv_vhca;
struct mlx5_thermal *thermal;
};

struct mlx5_db {
Expand Down
26 changes: 26 additions & 0 deletions include/linux/mlx5/mlx5_ifc.h
Original file line number Diff line number Diff line change
Expand Up @@ -10869,6 +10869,31 @@ struct mlx5_ifc_mrtc_reg_bits {
u8 time_l[0x20];
};

/* Layout of the MTMP register, used through MLX5_SET()/MLX5_GET() with the
 * "mtmp_reg" type name. Each array size is the field's width; the
 * reserved_at_<N> names equal the running sum of the preceding widths, so
 * field order and sizes must not be changed.
 */
struct mlx5_ifc_mtmp_reg_bits {
u8 reserved_at_0[0x14];
/* Selects the sensor being queried. */
u8 sensor_index[0xc];

u8 reserved_at_20[0x10];
/* Current reading; the thermal code in this change treats it as being in
 * units of 0.125 C.
 */
u8 temperature[0x10];

u8 mte[0x1];
u8 mtr[0x1];
u8 reserved_at_42[0xe];
u8 max_temperature[0x10];

u8 tee[0x2];
u8 reserved_at_62[0xe];
u8 temp_threshold_hi[0x10];

u8 reserved_at_80[0x10];
u8 temp_threshold_lo[0x10];

u8 reserved_at_a0[0x20];

u8 sensor_name_hi[0x20];
u8 sensor_name_lo[0x20];
};

union mlx5_ifc_ports_control_registers_document_bits {
struct mlx5_ifc_bufferx_reg_bits bufferx_reg;
struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout;
Expand Down Expand Up @@ -10931,6 +10956,7 @@ union mlx5_ifc_ports_control_registers_document_bits {
struct mlx5_ifc_mfrl_reg_bits mfrl_reg;
struct mlx5_ifc_mtutc_reg_bits mtutc_reg;
struct mlx5_ifc_mrtc_reg_bits mrtc_reg;
struct mlx5_ifc_mtmp_reg_bits mtmp_reg;
u8 reserved_at_0[0x60e0];
};

Expand Down

0 comments on commit c1fef61

Please sign in to comment.