From 4d080a029db1b2ff7e173d86886b3429596f3c63 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 28 Aug 2024 07:35:11 +0000 Subject: [PATCH 01/29] rust: sizes: add commonly used constants Add rust equivalent to include/linux/sizes.h, makes code more readable. Only SZ_*K that QT2025 PHY driver uses are added. Make generated constants accessible with a proper type. Reviewed-by: Alice Ryhl Reviewed-by: Andrew Lunn Reviewed-by: Benno Lossin Reviewed-by: Trevor Gross Signed-off-by: FUJITA Tomonori Signed-off-by: David S. Miller --- rust/kernel/lib.rs | 1 + rust/kernel/sizes.rs | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 rust/kernel/sizes.rs diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index 274bdc1b0a824a..58ed400198bf2e 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -43,6 +43,7 @@ pub mod net; pub mod page; pub mod prelude; pub mod print; +pub mod sizes; mod static_assert; #[doc(hidden)] pub mod std_vendor; diff --git a/rust/kernel/sizes.rs b/rust/kernel/sizes.rs new file mode 100644 index 00000000000000..834c343e4170f5 --- /dev/null +++ b/rust/kernel/sizes.rs @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Commonly used sizes. +//! +//! C headers: [`include/linux/sizes.h`](srctree/include/linux/sizes.h). 
+ +/// 0x00000400 +pub const SZ_1K: usize = bindings::SZ_1K as usize; +/// 0x00000800 +pub const SZ_2K: usize = bindings::SZ_2K as usize; +/// 0x00001000 +pub const SZ_4K: usize = bindings::SZ_4K as usize; +/// 0x00002000 +pub const SZ_8K: usize = bindings::SZ_8K as usize; +/// 0x00004000 +pub const SZ_16K: usize = bindings::SZ_16K as usize; +/// 0x00008000 +pub const SZ_32K: usize = bindings::SZ_32K as usize; +/// 0x00010000 +pub const SZ_64K: usize = bindings::SZ_64K as usize; +/// 0x00020000 +pub const SZ_128K: usize = bindings::SZ_128K as usize; +/// 0x00040000 +pub const SZ_256K: usize = bindings::SZ_256K as usize; +/// 0x00080000 +pub const SZ_512K: usize = bindings::SZ_512K as usize; From ffd2747de6ab1545883bffe23f24e60625c1f455 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 28 Aug 2024 07:35:12 +0000 Subject: [PATCH 02/29] rust: net::phy support probe callback Support phy_driver probe callback, used to set up device-specific structures. Reviewed-by: Alice Ryhl Reviewed-by: Andrew Lunn Reviewed-by: Benno Lossin Reviewed-by: Trevor Gross Signed-off-by: FUJITA Tomonori Signed-off-by: David S. Miller --- rust/kernel/net/phy.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/rust/kernel/net/phy.rs b/rust/kernel/net/phy.rs index fd40b703d2244f..5e8137a1972f9b 100644 --- a/rust/kernel/net/phy.rs +++ b/rust/kernel/net/phy.rs @@ -338,6 +338,21 @@ impl Adapter { }) } + /// # Safety + /// + /// `phydev` must be passed by the corresponding callback in `phy_driver`. + unsafe extern "C" fn probe_callback(phydev: *mut bindings::phy_device) -> core::ffi::c_int { + from_result(|| { + // SAFETY: This callback is called only in contexts + // where we can exclusively access `phy_device` because + // it's not published yet, so the accessors on `Device` are okay + // to call. 
+ let dev = unsafe { Device::from_raw(phydev) }; + T::probe(dev)?; + Ok(0) + }) + } + /// # Safety /// /// `phydev` must be passed by the corresponding callback in `phy_driver`. @@ -511,6 +526,11 @@ pub const fn create_phy_driver() -> DriverVTable { } else { None }, + probe: if T::HAS_PROBE { + Some(Adapter::::probe_callback) + } else { + None + }, get_features: if T::HAS_GET_FEATURES { Some(Adapter::::get_features_callback) } else { @@ -583,6 +603,11 @@ pub trait Driver { kernel::build_error(VTABLE_DEFAULT_ERROR) } + /// Sets up device-specific structures during discovery. + fn probe(_dev: &mut Device) -> Result { + kernel::build_error(VTABLE_DEFAULT_ERROR) + } + /// Probes the hardware to determine what abilities it has. fn get_features(_dev: &mut Device) -> Result { kernel::build_error(VTABLE_DEFAULT_ERROR) From 7909892a9fbb3e60623e60c3c3e95e10fc56f687 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 28 Aug 2024 07:35:13 +0000 Subject: [PATCH 03/29] rust: net::phy implement AsRef trait Implement AsRef trait for Device. A PHY driver needs a reference to device::Device to call the firmware API. Reviewed-by: Alice Ryhl Reviewed-by: Andrew Lunn Reviewed-by: Benno Lossin Reviewed-by: Trevor Gross Signed-off-by: FUJITA Tomonori Signed-off-by: David S. Miller --- rust/kernel/net/phy.rs | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/rust/kernel/net/phy.rs b/rust/kernel/net/phy.rs index 5e8137a1972f9b..b16e8c10a0a2c6 100644 --- a/rust/kernel/net/phy.rs +++ b/rust/kernel/net/phy.rs @@ -7,8 +7,7 @@ //! C headers: [`include/linux/phy.h`](srctree/include/linux/phy.h). use crate::{error::*, prelude::*, types::Opaque}; - -use core::marker::PhantomData; +use core::{marker::PhantomData, ptr::addr_of_mut}; /// PHY state machine states. 
/// @@ -58,8 +57,9 @@ pub enum DuplexMode { /// /// # Invariants /// -/// Referencing a `phy_device` using this struct asserts that you are in -/// a context where all methods defined on this struct are safe to call. +/// - Referencing a `phy_device` using this struct asserts that you are in +/// a context where all methods defined on this struct are safe to call. +/// - This struct always has a valid `self.0.mdio.dev`. /// /// [`struct phy_device`]: srctree/include/linux/phy.h // During the calls to most functions in [`Driver`], the C side (`PHYLIB`) holds a lock that is @@ -76,9 +76,11 @@ impl Device { /// /// # Safety /// - /// For the duration of 'a, the pointer must point at a valid `phy_device`, - /// and the caller must be in a context where all methods defined on this struct - /// are safe to call. + /// For the duration of `'a`, + /// - the pointer must point at a valid `phy_device`, and the caller + /// must be in a context where all methods defined on this struct + /// are safe to call. + /// - `(*ptr).mdio.dev` must be valid. unsafe fn from_raw<'a>(ptr: *mut bindings::phy_device) -> &'a mut Self { // CAST: `Self` is a `repr(transparent)` wrapper around `bindings::phy_device`. let ptr = ptr.cast::(); @@ -302,6 +304,14 @@ impl Device { } } +impl AsRef for Device { + fn as_ref(&self) -> &kernel::device::Device { + let phydev = self.0.get(); + // SAFETY: The struct invariant ensures that `mdio.dev` is valid. + unsafe { kernel::device::Device::as_ref(addr_of_mut!((*phydev).mdio.dev)) } + } +} + /// Defines certain other features this PHY supports (like interrupts). /// /// These flag values are used in [`Driver::FLAGS`]. From b2e47002b2350f57bfa8fe1c231e9fbb6baef78b Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 28 Aug 2024 07:35:14 +0000 Subject: [PATCH 04/29] rust: net::phy unified read/write API for C22 and C45 registers Add the unified read/write API for C22 and C45 registers. The abstractions support access to only C22 registers now. 
Instead of adding read/write_c45 methods specifically for C45, a new reg module supports the unified API to access C22 and C45 registers with trait, by calling an appropriate phylib functions. Reviewed-by: Trevor Gross Reviewed-by: Benno Lossin Reviewed-by: Andrew Lunn Signed-off-by: FUJITA Tomonori Signed-off-by: David S. Miller --- MAINTAINERS | 1 + drivers/net/phy/ax88796b_rust.rs | 7 +- rust/kernel/net/phy.rs | 31 ++--- rust/kernel/net/phy/reg.rs | 196 +++++++++++++++++++++++++++++++ rust/uapi/uapi_helper.h | 1 + 5 files changed, 209 insertions(+), 27 deletions(-) create mode 100644 rust/kernel/net/phy/reg.rs diff --git a/MAINTAINERS b/MAINTAINERS index 289a6b5615cea7..66107b9960aaf3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8357,6 +8357,7 @@ L: netdev@vger.kernel.org L: rust-for-linux@vger.kernel.org S: Maintained F: rust/kernel/net/phy.rs +F: rust/kernel/net/phy/reg.rs EXEC & BINFMT API, ELF R: Eric Biederman diff --git a/drivers/net/phy/ax88796b_rust.rs b/drivers/net/phy/ax88796b_rust.rs index 5c92572962dce7..8c7eb009d9fc0f 100644 --- a/drivers/net/phy/ax88796b_rust.rs +++ b/drivers/net/phy/ax88796b_rust.rs @@ -6,7 +6,7 @@ //! C version of this driver: [`drivers/net/phy/ax88796b.c`](./ax88796b.c) use kernel::{ c_str, - net::phy::{self, DeviceId, Driver}, + net::phy::{self, reg::C22, DeviceId, Driver}, prelude::*, uapi, }; @@ -24,7 +24,6 @@ kernel::module_phy_driver! { license: "GPL", } -const MII_BMCR: u16 = uapi::MII_BMCR as u16; const BMCR_SPEED100: u16 = uapi::BMCR_SPEED100 as u16; const BMCR_FULLDPLX: u16 = uapi::BMCR_FULLDPLX as u16; @@ -33,7 +32,7 @@ const BMCR_FULLDPLX: u16 = uapi::BMCR_FULLDPLX as u16; // Toggle BMCR_RESET bit off to accommodate broken AX8796B PHY implementation // such as used on the Individual Computers' X-Surf 100 Zorro card. 
fn asix_soft_reset(dev: &mut phy::Device) -> Result { - dev.write(uapi::MII_BMCR as u16, 0)?; + dev.write(C22::BMCR, 0)?; dev.genphy_soft_reset() } @@ -55,7 +54,7 @@ impl Driver for PhyAX88772A { } // If MII_LPA is 0, phy_resolve_aneg_linkmode() will fail to resolve // linkmode so use MII_BMCR as default values. - let ret = dev.read(MII_BMCR)?; + let ret = dev.read(C22::BMCR)?; if ret & BMCR_SPEED100 != 0 { dev.set_speed(uapi::SPEED_100); diff --git a/rust/kernel/net/phy.rs b/rust/kernel/net/phy.rs index b16e8c10a0a2c6..45866db14c76cc 100644 --- a/rust/kernel/net/phy.rs +++ b/rust/kernel/net/phy.rs @@ -9,6 +9,8 @@ use crate::{error::*, prelude::*, types::Opaque}; use core::{marker::PhantomData, ptr::addr_of_mut}; +pub mod reg; + /// PHY state machine states. /// /// Corresponds to the kernel's [`enum phy_state`]. @@ -177,32 +179,15 @@ impl Device { unsafe { (*phydev).duplex = v }; } - /// Reads a given C22 PHY register. + /// Reads a PHY register. // This function reads a hardware register and updates the stats so takes `&mut self`. - pub fn read(&mut self, regnum: u16) -> Result { - let phydev = self.0.get(); - // SAFETY: `phydev` is pointing to a valid object by the type invariant of `Self`. - // So it's just an FFI call, open code of `phy_read()` with a valid `phy_device` pointer - // `phydev`. - let ret = unsafe { - bindings::mdiobus_read((*phydev).mdio.bus, (*phydev).mdio.addr, regnum.into()) - }; - if ret < 0 { - Err(Error::from_errno(ret)) - } else { - Ok(ret as u16) - } + pub fn read(&mut self, reg: R) -> Result { + reg.read(self) } - /// Writes a given C22 PHY register. - pub fn write(&mut self, regnum: u16, val: u16) -> Result { - let phydev = self.0.get(); - // SAFETY: `phydev` is pointing to a valid object by the type invariant of `Self`. - // So it's just an FFI call, open code of `phy_write()` with a valid `phy_device` pointer - // `phydev`. 
- to_result(unsafe { - bindings::mdiobus_write((*phydev).mdio.bus, (*phydev).mdio.addr, regnum.into(), val) - }) + /// Writes a PHY register. + pub fn write(&mut self, reg: R, val: u16) -> Result { + reg.write(self, val) } /// Reads a paged register. diff --git a/rust/kernel/net/phy/reg.rs b/rust/kernel/net/phy/reg.rs new file mode 100644 index 00000000000000..4563737a96759a --- /dev/null +++ b/rust/kernel/net/phy/reg.rs @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2024 FUJITA Tomonori + +//! PHY register interfaces. +//! +//! This module provides support for accessing PHY registers in the +//! Ethernet management interface clauses 22 and 45 register namespaces, as +//! defined in IEEE 802.3. + +use super::Device; +use crate::build_assert; +use crate::error::*; +use crate::uapi; + +mod private { + /// Marker that a trait cannot be implemented outside of this crate + pub trait Sealed {} +} + +/// Accesses PHY registers. +/// +/// This trait is used to implement the unified interface to access +/// C22 and C45 PHY registers. +/// +/// # Examples +/// +/// ```ignore +/// fn link_change_notify(dev: &mut Device) { +/// // read C22 BMCR register +/// dev.read(C22::BMCR); +/// // read C45 PMA/PMD control 1 register +/// dev.read(C45::new(Mmd::PMAPMD, 0)); +/// } +/// ``` +pub trait Register: private::Sealed { + /// Reads a PHY register. + fn read(&self, dev: &mut Device) -> Result; + + /// Writes a PHY register. + fn write(&self, dev: &mut Device, val: u16) -> Result; +} + +/// A single MDIO clause 22 register address (5 bits). +#[derive(Copy, Clone, Debug)] +pub struct C22(u8); + +impl C22 { + /// Basic mode control. + pub const BMCR: Self = C22(0x00); + /// Basic mode status. + pub const BMSR: Self = C22(0x01); + /// PHY identifier 1. + pub const PHYSID1: Self = C22(0x02); + /// PHY identifier 2. + pub const PHYSID2: Self = C22(0x03); + /// Auto-negotiation advertisement. 
+ pub const ADVERTISE: Self = C22(0x04); + /// Auto-negotiation link partner base page ability. + pub const LPA: Self = C22(0x05); + /// Auto-negotiation expansion. + pub const EXPANSION: Self = C22(0x06); + /// Auto-negotiation next page transmit. + pub const NEXT_PAGE_TRANSMIT: Self = C22(0x07); + /// Auto-negotiation link partner received next page. + pub const LP_RECEIVED_NEXT_PAGE: Self = C22(0x08); + /// Master-slave control. + pub const MASTER_SLAVE_CONTROL: Self = C22(0x09); + /// Master-slave status. + pub const MASTER_SLAVE_STATUS: Self = C22(0x0a); + /// PSE Control. + pub const PSE_CONTROL: Self = C22(0x0b); + /// PSE Status. + pub const PSE_STATUS: Self = C22(0x0c); + /// MMD Register control. + pub const MMD_CONTROL: Self = C22(0x0d); + /// MMD Register address data. + pub const MMD_DATA: Self = C22(0x0e); + /// Extended status. + pub const EXTENDED_STATUS: Self = C22(0x0f); + + /// Creates a new instance of `C22` with a vendor specific register. + pub const fn vendor_specific() -> Self { + build_assert!( + N > 0x0f && N < 0x20, + "Vendor-specific register address must be between 16 and 31" + ); + C22(N) + } +} + +impl private::Sealed for C22 {} + +impl Register for C22 { + fn read(&self, dev: &mut Device) -> Result { + let phydev = dev.0.get(); + // SAFETY: `phydev` is pointing to a valid object by the type invariant of `Device`. + // So it's just an FFI call, open code of `phy_read()` with a valid `phy_device` pointer + // `phydev`. + let ret = unsafe { + bindings::mdiobus_read((*phydev).mdio.bus, (*phydev).mdio.addr, self.0.into()) + }; + to_result(ret)?; + Ok(ret as u16) + } + + fn write(&self, dev: &mut Device, val: u16) -> Result { + let phydev = dev.0.get(); + // SAFETY: `phydev` is pointing to a valid object by the type invariant of `Device`. + // So it's just an FFI call, open code of `phy_write()` with a valid `phy_device` pointer + // `phydev`. 
+ to_result(unsafe { + bindings::mdiobus_write((*phydev).mdio.bus, (*phydev).mdio.addr, self.0.into(), val) + }) + } +} + +/// A single MDIO clause 45 register device and address. +#[derive(Copy, Clone, Debug)] +pub struct Mmd(u8); + +impl Mmd { + /// Physical Medium Attachment/Dependent. + pub const PMAPMD: Self = Mmd(uapi::MDIO_MMD_PMAPMD as u8); + /// WAN interface sublayer. + pub const WIS: Self = Mmd(uapi::MDIO_MMD_WIS as u8); + /// Physical coding sublayer. + pub const PCS: Self = Mmd(uapi::MDIO_MMD_PCS as u8); + /// PHY Extender sublayer. + pub const PHYXS: Self = Mmd(uapi::MDIO_MMD_PHYXS as u8); + /// DTE Extender sublayer. + pub const DTEXS: Self = Mmd(uapi::MDIO_MMD_DTEXS as u8); + /// Transmission convergence. + pub const TC: Self = Mmd(uapi::MDIO_MMD_TC as u8); + /// Auto negotiation. + pub const AN: Self = Mmd(uapi::MDIO_MMD_AN as u8); + /// Separated PMA (1). + pub const SEPARATED_PMA1: Self = Mmd(8); + /// Separated PMA (2). + pub const SEPARATED_PMA2: Self = Mmd(9); + /// Separated PMA (3). + pub const SEPARATED_PMA3: Self = Mmd(10); + /// Separated PMA (4). + pub const SEPARATED_PMA4: Self = Mmd(11); + /// OFDM PMA/PMD. + pub const OFDM_PMAPMD: Self = Mmd(12); + /// Power unit. + pub const POWER_UNIT: Self = Mmd(13); + /// Clause 22 extension. + pub const C22_EXT: Self = Mmd(uapi::MDIO_MMD_C22EXT as u8); + /// Vendor specific 1. + pub const VEND1: Self = Mmd(uapi::MDIO_MMD_VEND1 as u8); + /// Vendor specific 2. + pub const VEND2: Self = Mmd(uapi::MDIO_MMD_VEND2 as u8); +} + +/// A single MDIO clause 45 register device and address. +/// +/// Clause 45 uses a 5-bit device address to access a specific MMD within +/// a port, then a 16-bit register address to access a location within +/// that device. `C45` represents this by storing a [`Mmd`] and +/// a register number. +pub struct C45 { + devad: Mmd, + regnum: u16, +} + +impl C45 { + /// Creates a new instance of `C45`. 
+ pub fn new(devad: Mmd, regnum: u16) -> Self { + Self { devad, regnum } + } +} + +impl private::Sealed for C45 {} + +impl Register for C45 { + fn read(&self, dev: &mut Device) -> Result { + let phydev = dev.0.get(); + // SAFETY: `phydev` is pointing to a valid object by the type invariant of `Device`. + // So it's just an FFI call. + let ret = + unsafe { bindings::phy_read_mmd(phydev, self.devad.0.into(), self.regnum.into()) }; + to_result(ret)?; + Ok(ret as u16) + } + + fn write(&self, dev: &mut Device, val: u16) -> Result { + let phydev = dev.0.get(); + // SAFETY: `phydev` is pointing to a valid object by the type invariant of `Device`. + // So it's just an FFI call. + to_result(unsafe { + bindings::phy_write_mmd(phydev, self.devad.0.into(), self.regnum.into(), val) + }) + } +} diff --git a/rust/uapi/uapi_helper.h b/rust/uapi/uapi_helper.h index 08f5e9334c9e83..76d3f103e76499 100644 --- a/rust/uapi/uapi_helper.h +++ b/rust/uapi/uapi_helper.h @@ -7,5 +7,6 @@ */ #include +#include #include #include From 5114e05a3cfa61c2ea20fa2e223a8e519aa163e4 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 28 Aug 2024 07:35:15 +0000 Subject: [PATCH 05/29] rust: net::phy unified genphy_read_status function for C22 and C45 registers Add unified genphy_read_status function for C22 and C45 registers. Instead of having genphy_c22 and genphy_c45 methods, this unifies genphy_read_status functions for C22 and C45. Reviewed-by: Trevor Gross Reviewed-by: Benno Lossin Reviewed-by: Andrew Lunn Signed-off-by: FUJITA Tomonori Signed-off-by: David S. Miller --- rust/kernel/net/phy.rs | 12 ++---------- rust/kernel/net/phy/reg.rs | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/rust/kernel/net/phy.rs b/rust/kernel/net/phy.rs index 45866db14c76cc..1d47884aa3cf18 100644 --- a/rust/kernel/net/phy.rs +++ b/rust/kernel/net/phy.rs @@ -252,16 +252,8 @@ impl Device { } /// Checks the link status and updates current link state. 
- pub fn genphy_read_status(&mut self) -> Result { - let phydev = self.0.get(); - // SAFETY: `phydev` is pointing to a valid object by the type invariant of `Self`. - // So it's just an FFI call. - let ret = unsafe { bindings::genphy_read_status(phydev) }; - if ret < 0 { - Err(Error::from_errno(ret)) - } else { - Ok(ret as u16) - } + pub fn genphy_read_status(&mut self) -> Result { + R::read_status(self) } /// Updates the link status. diff --git a/rust/kernel/net/phy/reg.rs b/rust/kernel/net/phy/reg.rs index 4563737a96759a..a7db0064cb7d60 100644 --- a/rust/kernel/net/phy/reg.rs +++ b/rust/kernel/net/phy/reg.rs @@ -31,6 +31,13 @@ mod private { /// dev.read(C22::BMCR); /// // read C45 PMA/PMD control 1 register /// dev.read(C45::new(Mmd::PMAPMD, 0)); +/// +/// // Checks the link status as reported by registers in the C22 namespace +/// // and updates current link state. +/// dev.genphy_read_status::(); +/// // Checks the link status as reported by registers in the C45 namespace +/// // and updates current link state. +/// dev.genphy_read_status::(); /// } /// ``` pub trait Register: private::Sealed { @@ -39,6 +46,9 @@ pub trait Register: private::Sealed { /// Writes a PHY register. fn write(&self, dev: &mut Device, val: u16) -> Result; + + /// Checks the link status and updates current link state. + fn read_status(dev: &mut Device) -> Result; } /// A single MDIO clause 22 register address (5 bits). @@ -113,6 +123,15 @@ impl Register for C22 { bindings::mdiobus_write((*phydev).mdio.bus, (*phydev).mdio.addr, self.0.into(), val) }) } + + fn read_status(dev: &mut Device) -> Result { + let phydev = dev.0.get(); + // SAFETY: `phydev` is pointing to a valid object by the type invariant of `Device`. + // So it's just an FFI call. + let ret = unsafe { bindings::genphy_read_status(phydev) }; + to_result(ret)?; + Ok(ret as u16) + } } /// A single MDIO clause 45 register device and address. 
@@ -193,4 +212,13 @@ impl Register for C45 { bindings::phy_write_mmd(phydev, self.devad.0.into(), self.regnum.into(), val) }) } + + fn read_status(dev: &mut Device) -> Result { + let phydev = dev.0.get(); + // SAFETY: `phydev` is pointing to a valid object by the type invariant of `Device`. + // So it's just an FFI call. + let ret = unsafe { bindings::genphy_c45_read_status(phydev) }; + to_result(ret)?; + Ok(ret as u16) + } } From fd3eaad826daf4774835599e264b216a30129c32 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 28 Aug 2024 07:35:16 +0000 Subject: [PATCH 06/29] net: phy: add Applied Micro QT2025 PHY driver This driver supports Applied Micro Circuits Corporation QT2025 PHY, based on a driver for Tehuti Networks TN40xx chips. The original driver for TN40xx chips supports multiple PHY hardware (AMCC QT2025, TI TLK10232, Aqrate AQR105, and Marvell 88X3120, 88X3310, and MV88E2010). This driver is extracted from the original driver and modified to a PHY driver in Rust. This has been tested with Edimax EN-9320SFP+ 10G network adapter. Reviewed-by: Trevor Gross Reviewed-by: Andrew Lunn Signed-off-by: FUJITA Tomonori Signed-off-by: David S. 
Miller --- MAINTAINERS | 8 +++ drivers/net/phy/Kconfig | 7 +++ drivers/net/phy/Makefile | 1 + drivers/net/phy/qt2025.rs | 103 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 119 insertions(+) create mode 100644 drivers/net/phy/qt2025.rs diff --git a/MAINTAINERS b/MAINTAINERS index 66107b9960aaf3..baf88e74c907fc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1609,6 +1609,14 @@ F: Documentation/admin-guide/perf/xgene-pmu.rst F: Documentation/devicetree/bindings/perf/apm-xgene-pmu.txt F: drivers/perf/xgene_pmu.c +APPLIED MICRO QT2025 PHY DRIVER +M: FUJITA Tomonori +R: Trevor Gross +L: netdev@vger.kernel.org +L: rust-for-linux@vger.kernel.org +S: Maintained +F: drivers/net/phy/qt2025.rs + APTINA CAMERA SENSOR PLL M: Laurent Pinchart L: linux-media@vger.kernel.org diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index f530fcd092fe4f..01b235b3bb7e80 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -112,6 +112,13 @@ config ADIN1100_PHY Currently supports the: - ADIN1100 - Robust,Industrial, Low Power 10BASE-T1L Ethernet PHY +config AMCC_QT2025_PHY + tristate "AMCC QT2025 PHY" + depends on RUST_PHYLIB_ABSTRACTIONS + depends on RUST_FW_LOADER_ABSTRACTIONS + help + Adds support for the Applied Micro Circuits Corporation QT2025 PHY. 
+ source "drivers/net/phy/aquantia/Kconfig" config AX88796B_PHY diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 33adb3df5f64f1..90f886844381d0 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_ADIN_PHY) += adin.o obj-$(CONFIG_ADIN1100_PHY) += adin1100.o obj-$(CONFIG_AIR_EN8811H_PHY) += air_en8811h.o obj-$(CONFIG_AMD_PHY) += amd.o +obj-$(CONFIG_AMCC_QT2025_PHY) += qt2025.o obj-$(CONFIG_AQUANTIA_PHY) += aquantia/ ifdef CONFIG_AX88796B_RUST_PHY obj-$(CONFIG_AX88796B_PHY) += ax88796b_rust.o diff --git a/drivers/net/phy/qt2025.rs b/drivers/net/phy/qt2025.rs new file mode 100644 index 00000000000000..28d8981f410bb5 --- /dev/null +++ b/drivers/net/phy/qt2025.rs @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) Tehuti Networks Ltd. +// Copyright (C) 2024 FUJITA Tomonori + +//! Applied Micro Circuits Corporation QT2025 PHY driver +//! +//! This driver is based on the vendor driver `QT2025_phy.c`. This source +//! and firmware can be downloaded on the EN-9320SFP+ support site. +//! +//! The QT2025 PHY integrates an Intel 8051 micro-controller. + +use kernel::c_str; +use kernel::error::code; +use kernel::firmware::Firmware; +use kernel::net::phy::{ + self, + reg::{Mmd, C45}, + DeviceId, Driver, +}; +use kernel::prelude::*; +use kernel::sizes::{SZ_16K, SZ_8K}; + +kernel::module_phy_driver! { + drivers: [PhyQT2025], + device_table: [ + DeviceId::new_with_driver::(), + ], + name: "qt2025_phy", + author: "FUJITA Tomonori ", + description: "AMCC QT2025 PHY driver", + license: "GPL", + firmware: ["qt2025-2.0.3.3.fw"], +} + +struct PhyQT2025; + +#[vtable] +impl Driver for PhyQT2025 { + const NAME: &'static CStr = c_str!("QT2025 10Gpbs SFP+"); + const PHY_DEVICE_ID: phy::DeviceId = phy::DeviceId::new_with_exact_mask(0x0043a400); + + fn probe(dev: &mut phy::Device) -> Result<()> { + // Check the hardware revision code. + // Only 0xb3 works with this driver and firmware. 
+ let hw_rev = dev.read(C45::new(Mmd::PMAPMD, 0xd001))?; + if (hw_rev >> 8) != 0xb3 { + return Err(code::ENODEV); + } + + // `MICRO_RESETN`: hold the micro-controller in reset while configuring. + dev.write(C45::new(Mmd::PMAPMD, 0xc300), 0x0000)?; + // `SREFCLK_FREQ`: configure clock frequency of the micro-controller. + dev.write(C45::new(Mmd::PMAPMD, 0xc302), 0x0004)?; + // Non loopback mode. + dev.write(C45::new(Mmd::PMAPMD, 0xc319), 0x0038)?; + // `CUS_LAN_WAN_CONFIG`: select between LAN and WAN (WIS) mode. + dev.write(C45::new(Mmd::PMAPMD, 0xc31a), 0x0098)?; + // The following writes use standardized registers (3.38 through + // 3.41 5/10/25GBASE-R PCS test pattern seed B) for something else. + // We don't know what. + dev.write(C45::new(Mmd::PCS, 0x0026), 0x0e00)?; + dev.write(C45::new(Mmd::PCS, 0x0027), 0x0893)?; + dev.write(C45::new(Mmd::PCS, 0x0028), 0xa528)?; + dev.write(C45::new(Mmd::PCS, 0x0029), 0x0003)?; + // Configure transmit and recovered clock. + dev.write(C45::new(Mmd::PMAPMD, 0xa30a), 0x06e1)?; + // `MICRO_RESETN`: release the micro-controller from the reset state. + dev.write(C45::new(Mmd::PMAPMD, 0xc300), 0x0002)?; + // The micro-controller will start running from the boot ROM. + dev.write(C45::new(Mmd::PCS, 0xe854), 0x00c0)?; + + let fw = Firmware::request(c_str!("qt2025-2.0.3.3.fw"), dev.as_ref())?; + if fw.data().len() > SZ_16K + SZ_8K { + return Err(code::EFBIG); + } + + // The 24kB of program memory space is accessible by MDIO. + // The first 16kB of memory is located in the address range 3.8000h - 3.BFFFh. + // The next 8kB of memory is located at 4.8000h - 4.9FFFh. 
+ let mut dst_offset = 0; + let mut dst_mmd = Mmd::PCS; + for (src_idx, val) in fw.data().iter().enumerate() { + if src_idx == SZ_16K { + // Start writing to the next register with no offset + dst_offset = 0; + dst_mmd = Mmd::PHYXS; + } + + dev.write(C45::new(dst_mmd, 0x8000 + dst_offset), (*val).into())?; + + dst_offset += 1; + } + // The micro-controller will start running from SRAM. + dev.write(C45::new(Mmd::PCS, 0xe854), 0x0040)?; + + // TODO: sleep here until the hw becomes ready. + Ok(()) + } + + fn read_status(dev: &mut phy::Device) -> Result { + dev.genphy_read_status::() + } +} From 74ce94ac38a6eac2ffc235739294f24964fd0a86 Mon Sep 17 00:00:00 2001 From: Shen Lichuan Date: Thu, 29 Aug 2024 10:12:53 +0800 Subject: [PATCH 07/29] sfc: Convert to use ERR_CAST() As opposed to open-code, using the ERR_CAST macro clearly indicates that this is a pointer to an error value and a type conversion was performed. Signed-off-by: Shen Lichuan Reviewed-by: Jacob Keller Reviewed-by: Martin Habets Reviewed-by: Edward Cree Link: https://patch.msgid.link/20240829021253.3066-1-shenlichuan@vivo.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/tc_counters.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/tc_counters.c b/drivers/net/ethernet/sfc/tc_counters.c index c4408842432363..a421b012350694 100644 --- a/drivers/net/ethernet/sfc/tc_counters.c +++ b/drivers/net/ethernet/sfc/tc_counters.c @@ -249,7 +249,7 @@ struct efx_tc_counter_index *efx_tc_flower_get_counter_index( &ctr->linkage, efx_tc_counter_id_ht_params); kfree(ctr); - return (void *)cnt; /* it's an ERR_PTR */ + return ERR_CAST(cnt); } ctr->cnt = cnt; refcount_set(&ctr->ref, 1); From f24f966feb62164f4a68d1b84e866504904ac4ac Mon Sep 17 00:00:00 2001 From: Shen Lichuan Date: Thu, 29 Aug 2024 15:25:38 +0800 Subject: [PATCH 08/29] nfp: Convert to use ERR_CAST() Use ERR_CAST() as it is designed for casting an error pointer to another type. 
Signed-off-by: Shen Lichuan Link: https://patch.msgid.link/20240829072538.33195-1-shenlichuan@vivo.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c index 7136bc48530ba0..df0234a338a8b1 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c @@ -278,7 +278,7 @@ struct nfp_nsp *nfp_nsp_open(struct nfp_cpp *cpp) res = nfp_resource_acquire(cpp, NFP_RESOURCE_NSP); if (IS_ERR(res)) - return (void *)res; + return ERR_CAST(res); state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) { From b26b64493343659cce8bbffa358bf39e4f68bdec Mon Sep 17 00:00:00 2001 From: Yan Zhen Date: Thu, 29 Aug 2024 17:55:09 +0800 Subject: [PATCH 09/29] net: openvswitch: Use ERR_CAST() to return Using ERR_CAST() is more reasonable and safer, When it is necessary to convert the type of an error pointer and return it. 
Signed-off-by: Yan Zhen Acked-by: Eelco Chaudron Reviewed-by: Aaron Conole Link: https://patch.msgid.link/20240829095509.3151987-1-yanzhen@vivo.com Signed-off-by: Jakub Kicinski --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index c92bdc4dfe1960..729ef582a3a8b8 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2491,7 +2491,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, acts = nla_alloc_flow_actions(new_acts_size); if (IS_ERR(acts)) - return (void *)acts; + return ERR_CAST(acts); memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); acts->actions_len = (*sfa)->actions_len; From 8c2bd38b95f75f3d2a08c93e35303e26d480d24e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Aug 2024 14:46:39 +0000 Subject: [PATCH 10/29] icmp: change the order of rate limits ICMP messages are ratelimited : After the blamed commits, the two rate limiters are applied in this order: 1) host wide ratelimit (icmp_global_allow()) 2) Per destination ratelimit (inetpeer based) In order to avoid side-channels attacks, we need to apply the per destination check first. This patch makes the following change : 1) icmp_global_allow() checks if the host wide limit is reached. But credits are not yet consumed. This is deferred to 3) 2) The per destination limit is checked/updated. This might add a new node in inetpeer tree. 3) icmp_global_consume() consumes tokens if prior operations succeeded. This means that host wide ratelimit is still effective in keeping inetpeer tree small even under DDOS. As a bonus, I removed icmp_global.lock as the fast path can use a lock-free operation. 
Fixes: c0303efeab73 ("net: reduce cycles spend on ICMP replies that gets rate limited") Fixes: 4cdf507d5452 ("icmp: add a global rate limitation") Reported-by: Keyu Man Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Cc: Jesper Dangaard Brouer Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20240829144641.3880376-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 2 + net/ipv4/icmp.c | 103 ++++++++++++++++++++++++++--------------------- net/ipv6/icmp.c | 28 ++++++++----- 3 files changed, 76 insertions(+), 57 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index c5606cadb1a552..82248813619e3f 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -795,6 +795,8 @@ static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) } bool icmp_global_allow(void); +void icmp_global_consume(void); + extern int sysctl_icmp_msgs_per_sec; extern int sysctl_icmp_msgs_burst; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index b8f56d03fcbb62..0078e8fb2e86d0 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -224,57 +224,59 @@ int sysctl_icmp_msgs_per_sec __read_mostly = 1000; int sysctl_icmp_msgs_burst __read_mostly = 50; static struct { - spinlock_t lock; - u32 credit; + atomic_t credit; u32 stamp; -} icmp_global = { - .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock), -}; +} icmp_global; /** * icmp_global_allow - Are we allowed to send one more ICMP message ? * * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec. * Returns false if we reached the limit and can not send another packet. - * Note: called with BH disabled + * Works in tandem with icmp_global_consume(). */ bool icmp_global_allow(void) { - u32 credit, delta, incr = 0, now = (u32)jiffies; - bool rc = false; + u32 delta, now, oldstamp; + int incr, new, old; - /* Check if token bucket is empty and cannot be refilled - * without taking the spinlock. The READ_ONCE() are paired - * with the following WRITE_ONCE() in this same function. 
+ /* Note: many cpus could find this condition true. + * Then later icmp_global_consume() could consume more credits, + * this is an acceptable race. */ - if (!READ_ONCE(icmp_global.credit)) { - delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ); - if (delta < HZ / 50) - return false; - } + if (atomic_read(&icmp_global.credit) > 0) + return true; - spin_lock(&icmp_global.lock); - delta = min_t(u32, now - icmp_global.stamp, HZ); - if (delta >= HZ / 50) { - incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ; - if (incr) - WRITE_ONCE(icmp_global.stamp, now); - } - credit = min_t(u32, icmp_global.credit + incr, - READ_ONCE(sysctl_icmp_msgs_burst)); - if (credit) { - /* We want to use a credit of one in average, but need to randomize - * it for security reasons. - */ - credit = max_t(int, credit - get_random_u32_below(3), 0); - rc = true; + now = jiffies; + oldstamp = READ_ONCE(icmp_global.stamp); + delta = min_t(u32, now - oldstamp, HZ); + if (delta < HZ / 50) + return false; + + incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ; + if (!incr) + return false; + + if (cmpxchg(&icmp_global.stamp, oldstamp, now) == oldstamp) { + old = atomic_read(&icmp_global.credit); + do { + new = min(old + incr, READ_ONCE(sysctl_icmp_msgs_burst)); + } while (!atomic_try_cmpxchg(&icmp_global.credit, &old, new)); } - WRITE_ONCE(icmp_global.credit, credit); - spin_unlock(&icmp_global.lock); - return rc; + return true; } EXPORT_SYMBOL(icmp_global_allow); +void icmp_global_consume(void) +{ + int credits = get_random_u32_below(3); + + /* Note: this might make icmp_global.credit negative. 
*/ + if (credits) + atomic_sub(credits, &icmp_global.credit); +} +EXPORT_SYMBOL(icmp_global_consume); + static bool icmpv4_mask_allow(struct net *net, int type, int code) { if (type > NR_ICMP_TYPES) @@ -291,14 +293,16 @@ static bool icmpv4_mask_allow(struct net *net, int type, int code) return false; } -static bool icmpv4_global_allow(struct net *net, int type, int code) +static bool icmpv4_global_allow(struct net *net, int type, int code, + bool *apply_ratelimit) { if (icmpv4_mask_allow(net, type, code)) return true; - if (icmp_global_allow()) + if (icmp_global_allow()) { + *apply_ratelimit = true; return true; - + } __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -308,15 +312,16 @@ static bool icmpv4_global_allow(struct net *net, int type, int code) */ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, - struct flowi4 *fl4, int type, int code) + struct flowi4 *fl4, int type, int code, + bool apply_ratelimit) { struct dst_entry *dst = &rt->dst; struct inet_peer *peer; bool rc = true; int vif; - if (icmpv4_mask_allow(net, type, code)) - goto out; + if (!apply_ratelimit) + return true; /* No rate limit on loopback */ if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) @@ -331,6 +336,8 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, out: if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); + else + icmp_global_consume(); return rc; } @@ -402,6 +409,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct ipcm_cookie ipc; struct rtable *rt = skb_rtable(skb); struct net *net = dev_net(rt->dst.dev); + bool apply_ratelimit = false; struct flowi4 fl4; struct sock *sk; struct inet_sock *inet; @@ -413,11 +421,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb)) return; - /* Needed by both icmp_global_allow and icmp_xmit_lock */ + /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ 
local_bh_disable(); - /* global icmp_msgs_per_sec */ - if (!icmpv4_global_allow(net, type, code)) + /* is global icmp_msgs_per_sec exhausted ? */ + if (!icmpv4_global_allow(net, type, code, &apply_ratelimit)) goto out_bh_enable; sk = icmp_xmit_lock(net); @@ -450,7 +458,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) goto out_unlock; - if (icmpv4_xrlim_allow(net, rt, &fl4, type, code)) + if (icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt); ip_rt_put(rt); out_unlock: @@ -596,6 +604,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, int room; struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); + bool apply_ratelimit = false; struct ipcm_cookie ipc; struct flowi4 fl4; __be32 saddr; @@ -677,7 +686,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, } } - /* Needed by both icmp_global_allow and icmp_xmit_lock */ + /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit, unless @@ -685,7 +694,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, * loopback, then peer ratelimit still work (in icmpv4_xrlim_allow) */ if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) && - !icmpv4_global_allow(net, type, code)) + !icmpv4_global_allow(net, type, code, &apply_ratelimit)) goto out_bh_enable; sk = icmp_xmit_lock(net); @@ -744,7 +753,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, goto out_unlock; /* peer icmp_ratelimit */ - if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) + if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) goto ende; /* RFC says return as much as we can without exceeding 576 bytes. 
*/ diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 7b31674644efc3..46f70e4a835139 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -175,14 +175,16 @@ static bool icmpv6_mask_allow(struct net *net, int type) return false; } -static bool icmpv6_global_allow(struct net *net, int type) +static bool icmpv6_global_allow(struct net *net, int type, + bool *apply_ratelimit) { if (icmpv6_mask_allow(net, type)) return true; - if (icmp_global_allow()) + if (icmp_global_allow()) { + *apply_ratelimit = true; return true; - + } __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -191,13 +193,13 @@ static bool icmpv6_global_allow(struct net *net, int type) * Check the ICMP output rate limit */ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, - struct flowi6 *fl6) + struct flowi6 *fl6, bool apply_ratelimit) { struct net *net = sock_net(sk); struct dst_entry *dst; bool res = false; - if (icmpv6_mask_allow(net, type)) + if (!apply_ratelimit) return true; /* @@ -228,6 +230,8 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (!res) __ICMP6_INC_STATS(net, ip6_dst_idev(dst), ICMP6_MIB_RATELIMITHOST); + else + icmp_global_consume(); dst_release(dst); return res; } @@ -452,6 +456,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, struct net *net; struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; + bool apply_ratelimit = false; struct dst_entry *dst; struct icmp6hdr tmp_hdr; struct flowi6 fl6; @@ -533,11 +538,12 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, return; } - /* Needed by both icmp_global_allow and icmpv6_xmit_lock */ + /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */ local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit */ - if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type)) + if (!(skb->dev->flags & IFF_LOOPBACK) && + !icmpv6_global_allow(net, type, &apply_ratelimit)) goto out_bh_enable; mip6_addr_swap(skb, parm); @@ -575,7 
+581,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, np = inet6_sk(sk); - if (!icmpv6_xrlim_allow(sk, type, &fl6)) + if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit)) goto out; tmp_hdr.icmp6_type = type; @@ -717,6 +723,7 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; struct icmp6hdr *icmph = icmp6_hdr(skb); + bool apply_ratelimit = false; struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; @@ -781,8 +788,9 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) goto out; /* Check the ratelimit */ - if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY)) || - !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6)) + if ((!(skb->dev->flags & IFF_LOOPBACK) && + !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY, &apply_ratelimit)) || + !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6, apply_ratelimit)) goto out_dst_release; idev = __in6_dev_get(skb->dev); From b056b4cd9178f7a1d5d57f7b48b073c29729ddaa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Aug 2024 14:46:40 +0000 Subject: [PATCH 11/29] icmp: move icmp_global.credit and icmp_global.stamp to per netns storage Host wide ICMP ratelimiter should be per netns, to provide better isolation. Following patch in this series makes the sysctl per netns. 
Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20240829144641.3880376-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 4 ++-- include/net/netns/ipv4.h | 3 ++- net/ipv4/icmp.c | 26 +++++++++++--------------- net/ipv6/icmp.c | 4 ++-- 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 82248813619e3f..d3bca4e83979f6 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -794,8 +794,8 @@ static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) ip_cmsg_recv_offset(msg, skb->sk, skb, 0, 0); } -bool icmp_global_allow(void); -void icmp_global_consume(void); +bool icmp_global_allow(struct net *net); +void icmp_global_consume(struct net *net); extern int sysctl_icmp_msgs_per_sec; extern int sysctl_icmp_msgs_burst; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 5fcd61ada62289..54fe7c079fffb2 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -122,7 +122,8 @@ struct netns_ipv4 { u8 sysctl_icmp_errors_use_inbound_ifaddr; int sysctl_icmp_ratelimit; int sysctl_icmp_ratemask; - + atomic_t icmp_global_credit; + u32 icmp_global_stamp; u32 ip_rt_min_pmtu; int ip_rt_mtu_expires; int ip_rt_min_advmss; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 0078e8fb2e86d0..2e1d81dbdbb6fe 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -223,19 +223,15 @@ static inline void icmp_xmit_unlock(struct sock *sk) int sysctl_icmp_msgs_per_sec __read_mostly = 1000; int sysctl_icmp_msgs_burst __read_mostly = 50; -static struct { - atomic_t credit; - u32 stamp; -} icmp_global; - /** * icmp_global_allow - Are we allowed to send one more ICMP message ? + * @net: network namespace * * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec. * Returns false if we reached the limit and can not send another packet. * Works in tandem with icmp_global_consume(). 
*/ -bool icmp_global_allow(void) +bool icmp_global_allow(struct net *net) { u32 delta, now, oldstamp; int incr, new, old; @@ -244,11 +240,11 @@ bool icmp_global_allow(void) * Then later icmp_global_consume() could consume more credits, * this is an acceptable race. */ - if (atomic_read(&icmp_global.credit) > 0) + if (atomic_read(&net->ipv4.icmp_global_credit) > 0) return true; now = jiffies; - oldstamp = READ_ONCE(icmp_global.stamp); + oldstamp = READ_ONCE(net->ipv4.icmp_global_stamp); delta = min_t(u32, now - oldstamp, HZ); if (delta < HZ / 50) return false; @@ -257,23 +253,23 @@ bool icmp_global_allow(void) if (!incr) return false; - if (cmpxchg(&icmp_global.stamp, oldstamp, now) == oldstamp) { - old = atomic_read(&icmp_global.credit); + if (cmpxchg(&net->ipv4.icmp_global_stamp, oldstamp, now) == oldstamp) { + old = atomic_read(&net->ipv4.icmp_global_credit); do { new = min(old + incr, READ_ONCE(sysctl_icmp_msgs_burst)); - } while (!atomic_try_cmpxchg(&icmp_global.credit, &old, new)); + } while (!atomic_try_cmpxchg(&net->ipv4.icmp_global_credit, &old, new)); } return true; } EXPORT_SYMBOL(icmp_global_allow); -void icmp_global_consume(void) +void icmp_global_consume(struct net *net) { int credits = get_random_u32_below(3); /* Note: this might make icmp_global.credit negative. 
*/ if (credits) - atomic_sub(credits, &icmp_global.credit); + atomic_sub(credits, &net->ipv4.icmp_global_credit); } EXPORT_SYMBOL(icmp_global_consume); @@ -299,7 +295,7 @@ static bool icmpv4_global_allow(struct net *net, int type, int code, if (icmpv4_mask_allow(net, type, code)) return true; - if (icmp_global_allow()) { + if (icmp_global_allow(net)) { *apply_ratelimit = true; return true; } @@ -337,7 +333,7 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); else - icmp_global_consume(); + icmp_global_consume(net); return rc; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 46f70e4a835139..071b0bc1179d81 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -181,7 +181,7 @@ static bool icmpv6_global_allow(struct net *net, int type, if (icmpv6_mask_allow(net, type)) return true; - if (icmp_global_allow()) { + if (icmp_global_allow(net)) { *apply_ratelimit = true; return true; } @@ -231,7 +231,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, __ICMP6_INC_STATS(net, ip6_dst_idev(dst), ICMP6_MIB_RATELIMITHOST); else - icmp_global_consume(); + icmp_global_consume(net); dst_release(dst); return res; } From f17bf505ff89595df5147755e51441632a5dc563 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Aug 2024 14:46:41 +0000 Subject: [PATCH 12/29] icmp: icmp_msgs_per_sec and icmp_msgs_burst sysctls become per netns Previous patch made ICMP rate limits per netns, it makes sense to allow each netns to change the associated sysctl. 
Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20240829144641.3880376-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 3 --- include/net/netns/ipv4.h | 2 ++ net/ipv4/icmp.c | 9 ++++----- net/ipv4/sysctl_net_ipv4.c | 32 ++++++++++++++++---------------- 4 files changed, 22 insertions(+), 24 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index d3bca4e83979f6..1ee472fa8b373e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -797,9 +797,6 @@ static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) bool icmp_global_allow(struct net *net); void icmp_global_consume(struct net *net); -extern int sysctl_icmp_msgs_per_sec; -extern int sysctl_icmp_msgs_burst; - #ifdef CONFIG_PROC_FS int ip_misc_proc_init(void); #endif diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 54fe7c079fffb2..276f622f351687 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -122,6 +122,8 @@ struct netns_ipv4 { u8 sysctl_icmp_errors_use_inbound_ifaddr; int sysctl_icmp_ratelimit; int sysctl_icmp_ratemask; + int sysctl_icmp_msgs_per_sec; + int sysctl_icmp_msgs_burst; atomic_t icmp_global_credit; u32 icmp_global_stamp; u32 ip_rt_min_pmtu; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 2e1d81dbdbb6fe..1ed88883e1f257 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -220,9 +220,6 @@ static inline void icmp_xmit_unlock(struct sock *sk) spin_unlock(&sk->sk_lock.slock); } -int sysctl_icmp_msgs_per_sec __read_mostly = 1000; -int sysctl_icmp_msgs_burst __read_mostly = 50; - /** * icmp_global_allow - Are we allowed to send one more ICMP message ? 
* @net: network namespace @@ -249,14 +246,14 @@ bool icmp_global_allow(struct net *net) if (delta < HZ / 50) return false; - incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ; + incr = READ_ONCE(net->ipv4.sysctl_icmp_msgs_per_sec) * delta / HZ; if (!incr) return false; if (cmpxchg(&net->ipv4.icmp_global_stamp, oldstamp, now) == oldstamp) { old = atomic_read(&net->ipv4.icmp_global_credit); do { - new = min(old + incr, READ_ONCE(sysctl_icmp_msgs_burst)); + new = min(old + incr, READ_ONCE(net->ipv4.sysctl_icmp_msgs_burst)); } while (!atomic_try_cmpxchg(&net->ipv4.icmp_global_credit, &old, new)); } return true; @@ -1492,6 +1489,8 @@ static int __net_init icmp_sk_init(struct net *net) net->ipv4.sysctl_icmp_ratelimit = 1 * HZ; net->ipv4.sysctl_icmp_ratemask = 0x1818; net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; + net->ipv4.sysctl_icmp_msgs_per_sec = 1000; + net->ipv4.sysctl_icmp_msgs_burst = 50; return 0; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4af0c234d8d763..a79b2a52ce01e6 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -600,22 +600,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0444, .proc_handler = proc_tcp_available_ulp, }, - { - .procname = "icmp_msgs_per_sec", - .data = &sysctl_icmp_msgs_per_sec, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "icmp_msgs_burst", - .data = &sysctl_icmp_msgs_burst, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, { .procname = "udp_mem", .data = &sysctl_udp_mem, @@ -701,6 +685,22 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "icmp_msgs_per_sec", + .data = &init_net.ipv4.sysctl_icmp_msgs_per_sec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "icmp_msgs_burst", + 
.data = &init_net.ipv4.sysctl_icmp_msgs_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, { .procname = "ping_group_range", .data = &init_net.ipv4.ping_group_range.range, From 6af91e3d2cfc8bb579b1aa2d22cd91f8c34acdf6 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Thu, 29 Aug 2024 15:57:42 +0000 Subject: [PATCH 13/29] Documentation: Add missing fields to net_cachelines Two fields, page_pools and *irq_moder, were added to struct net_device in commit 083772c9f972 ("net: page_pool: record pools per netdev") and commit f750dfe825b9 ("ethtool: provide customized dim profile management"), respectively. Add both to the net_cachelines documentation, as well. Signed-off-by: Joe Damato Link: https://patch.msgid.link/20240829155742.366584-1-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/net_device.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 70c4fb9d4e5ce0..a0e0fab8161ad5 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -176,3 +176,5 @@ netdevice_tracker dev_registered_tracker struct_rtnl_hw_stats64* offload_xstats_l3 struct_devlink_port* devlink_port struct_dpll_pin* dpll_pin +struct hlist_head page_pools +struct dim_irq_moder* irq_moder From cff69f72d33318f4ccfe7d5ff6c5616d00dd45a7 Mon Sep 17 00:00:00 2001 From: Diogo Jahchan Koike Date: Thu, 29 Aug 2024 15:48:27 -0300 Subject: [PATCH 14/29] ethtool: pse-pd: move pse validation into set Move validation into set, removing .set_validate operation as its current implementation holds the rtnl lock for acquiring the PHY device, defeating the intended purpose of checking before grabbing the lock. 
Reported-by: syzbot+ec369e6d58e210135f71@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=ec369e6d58e210135f71 Fixes: 31748765bed3 ("net: ethtool: pse-pd: Target the command to the requested PHY") Signed-off-by: Diogo Jahchan Koike Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20240829184830.5861-1-djahchankoike@gmail.com Signed-off-by: Jakub Kicinski --- net/ethtool/pse-pd.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index 507cb21d6bf0c5..a0705edca22a1a 100644 --- a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -222,13 +222,10 @@ const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = { }; static int -ethnl_set_pse_validate(struct ethnl_req_info *req_info, struct genl_info *info) +ethnl_set_pse_validate(struct phy_device *phydev, struct genl_info *info) { struct nlattr **tb = info->attrs; - struct phy_device *phydev; - phydev = ethnl_req_get_phydev(req_info, tb[ETHTOOL_A_PSE_HEADER], - info->extack); if (IS_ERR_OR_NULL(phydev)) { NL_SET_ERR_MSG(info->extack, "No PHY is attached"); return -EOPNOTSUPP; @@ -254,7 +251,7 @@ ethnl_set_pse_validate(struct ethnl_req_info *req_info, struct genl_info *info) return -EOPNOTSUPP; } - return 1; + return 0; } static int @@ -262,12 +259,13 @@ ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) { struct nlattr **tb = info->attrs; struct phy_device *phydev; - int ret = 0; + int ret; phydev = ethnl_req_get_phydev(req_info, tb[ETHTOOL_A_PSE_HEADER], info->extack); - if (IS_ERR_OR_NULL(phydev)) - return -ENODEV; + ret = ethnl_set_pse_validate(phydev, info); + if (ret) + return ret; if (tb[ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT]) { unsigned int pw_limit; @@ -314,7 +312,6 @@ const struct ethnl_request_ops ethnl_pse_request_ops = { .fill_reply = pse_fill_reply, .cleanup_data = pse_cleanup_data, - .set_validate = ethnl_set_pse_validate, .set = ethnl_set_pse, /* PSE has no notification 
*/ }; From 47afa284b96c62bda127604e6091fd5c9fd7e42c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 29 Aug 2024 09:54:48 +0300 Subject: [PATCH 15/29] ipv4: Unmask upper DSCP bits in RTM_GETROUTE output route lookup Unmask the upper DSCP bits when looking up an output route via the RTM_GETROUTE netlink message so that in the future the lookup could be performed according to the full DSCP value. No functional changes intended since the upper DSCP bits are masked when comparing against the TOS selectors in FIB rules and routes. Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f6972b24664a00..e4b45aa184701d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3261,7 +3261,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.daddr = dst; fl4.saddr = src; - fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK; + fl4.flowi4_tos = rtm->rtm_tos & INET_DSCP_MASK; fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; From a63cef46adcbedd4f4ea7401773a310edca53131 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 29 Aug 2024 09:54:49 +0300 Subject: [PATCH 16/29] ipv4: Unmask upper DSCP bits in ip_route_output_key_hash() Unmask the upper DSCP bits so that in the future output routes could be looked up according to the full DSCP value. Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e4b45aa184701d..5a77dc6d9c720b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2618,7 +2618,7 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; - fl4->flowi4_tos &= IPTOS_RT_MASK; + fl4->flowi4_tos &= INET_DSCP_MASK; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); From 4805646c42e51d2fbf142864d281473ad453ad5d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 29 Aug 2024 09:54:50 +0300 Subject: [PATCH 17/29] ipv4: icmp: Unmask upper DSCP bits in icmp_route_lookup() The function is called to resolve a route for an ICMP message that is sent in response to a situation. Based on the type of the generated ICMP message, the function is either passed the DS field of the packet that generated the ICMP message or a DS field that is derived from it. Unmask the upper DSCP bits before resolving an output route via ip_route_output_key_hash() so that in the future the lookup could be performed according to the full DSCP value. Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/ipv4/icmp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1ed88883e1f257..d2463b6e390eb9 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -93,6 +93,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -497,7 +498,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->saddr = saddr; fl4->flowi4_mark = mark; fl4->flowi4_uid = sock_net_uid(net, NULL); - fl4->flowi4_tos = RT_TOS(tos); + fl4->flowi4_tos = tos & INET_DSCP_MASK; fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; From ff95cb5e521b60d046e6571ada697a0977b189c3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 29 Aug 2024 09:54:51 +0300 Subject: [PATCH 18/29] ipv4: Unmask upper DSCP bits in ip_sock_rt_tos() The function is used to read the DS field that was stored in IPv4 sockets via the IP_TOS socket option so that it could be used to initialize the flowi4_tos field before resolving an output route. Unmask the upper DSCP bits so that in the future the output route lookup could be performed according to the full DSCP value. Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: David S. 
Miller --- include/net/route.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/net/route.h b/include/net/route.h index 93833cfe9c9681..b896f086ec8ed8 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -45,7 +46,7 @@ static inline __u8 ip_sock_rt_scope(const struct sock *sk) static inline __u8 ip_sock_rt_tos(const struct sock *sk) { - return RT_TOS(READ_ONCE(inet_sk(sk)->tos)); + return READ_ONCE(inet_sk(sk)->tos) & INET_DSCP_MASK; } struct ip_tunnel_info; From 356d054a4967e6190ee558b8e839fad3e9db35ec Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 29 Aug 2024 09:54:52 +0300 Subject: [PATCH 19/29] ipv4: Unmask upper DSCP bits in get_rttos() The function is used by a few socket types to retrieve the TOS value with which to perform the FIB lookup for packets sent through the socket (flowi4_tos). If a DS field was passed using the IP_TOS control message, then it is used. Otherwise the one specified via the IP_TOS socket option. Unmask the upper DSCP bits so that in the future the lookup could be performed according to the full DSCP value. Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: David S. Miller --- include/net/ip.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/net/ip.h b/include/net/ip.h index 1ee472fa8b373e..d92d3bc3ec0e25 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -33,6 +33,7 @@ #include #include #include +#include #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ #define IPV4_MIN_MTU 68 /* RFC 791 */ @@ -258,7 +259,9 @@ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet, static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet) { - return (ipc->tos != -1) ? RT_TOS(ipc->tos) : RT_TOS(READ_ONCE(inet->tos)); + u8 dsfield = ipc->tos != -1 ? 
ipc->tos : READ_ONCE(inet->tos); + + return dsfield & INET_DSCP_MASK; } /* datagram.c */ From f6c89e95555ace0cb10d01b07756bfa5db5ee7fa Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 29 Aug 2024 09:54:53 +0300 Subject: [PATCH 20/29] ipv4: Unmask upper DSCP bits when building flow key build_sk_flow_key() and __build_flow_key() are used to build an IPv4 flow key before calling one of the FIB lookup APIs. Unmask the upper DSCP bits so that in the future the lookup could be performed according to the full DSCP value. Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: David S. Miller --- net/ipv4/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5a77dc6d9c720b..723ac9181558c3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -512,7 +512,7 @@ static void __build_flow_key(const struct net *net, struct flowi4 *fl4, sk->sk_protocol; } - flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope, + flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope, prot, flow_flags, iph->daddr, iph->saddr, 0, 0, sock_net_uid(net, sk)); } @@ -541,7 +541,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), - ip_sock_rt_tos(sk) & IPTOS_RT_MASK, + ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW : sk->sk_protocol, From b261b2c6c18bcb81d69de011fd991bdfb97259f7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 29 Aug 2024 09:54:54 +0300 Subject: [PATCH 21/29] xfrm: Unmask upper DSCP bits in xfrm_get_tos() The function returns a value that is used to initialize 'flowi4_tos' before being passed to the FIB lookup API in the following call chain: xfrm_bundle_create() tos = xfrm_get_tos(fl, family) xfrm_dst_lookup(..., tos, ...) __xfrm_dst_lookup(..., tos, ...) 
xfrm4_dst_lookup(..., tos, ...) __xfrm4_dst_lookup(..., tos, ...) fl4->flowi4_tos = tos __ip_route_output_key(net, fl4) Unmask the upper DSCP bits so that in the future the output route lookup could be performed according to the full DSCP value. Remove IPTOS_RT_MASK since it is no longer used. Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: David S. Miller --- include/net/route.h | 2 -- net/xfrm/xfrm_policy.c | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index b896f086ec8ed8..1789f1e6640b46 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -266,8 +266,6 @@ static inline void ip_rt_put(struct rtable *rt) dst_release(&rt->dst); } -#define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3) - extern const __u8 ip_tos2prio[16]; static inline char rt_tos2priority(u8 tos) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c56c61b0c12ef2..b22767c0c078c6 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -45,6 +45,7 @@ #ifdef CONFIG_XFRM_ESPINTCP #include #endif +#include #include "xfrm_hash.h" @@ -2561,7 +2562,7 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, static int xfrm_get_tos(const struct flowi *fl, int family) { if (family == AF_INET) - return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; + return fl->u.ip4.flowi4_tos & INET_DSCP_MASK; return 0; } From 13f6538de2b845650e68996621489de547b0337e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 29 Aug 2024 09:54:55 +0300 Subject: [PATCH 22/29] ipv4: Unmask upper DSCP bits in ip_send_unicast_reply() The function calls flowi4_init_output() to initialize an IPv4 flow key with which it then performs a FIB lookup using ip_route_output_flow(). 'arg->tos' with which the TOS value in the IPv4 flow key (flowi4_tos) is initialized contains the full DS field. Unmask the upper DSCP bits so that in the future the FIB lookup could be performed according to the full DSCP value. 
Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b90d0f78ac8080..eea443b7f65e20 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -77,6 +77,7 @@ #include #include #include +#include #include #include #include @@ -1621,7 +1622,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, flowi4_init_output(&fl4, oif, IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark, - RT_TOS(arg->tos), + arg->tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), daddr, saddr, From 6a59526628ad6dadf389f45ddb3f75db44930897 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 29 Aug 2024 09:54:56 +0300 Subject: [PATCH 23/29] ipv6: sit: Unmask upper DSCP bits in ipip6_tunnel_xmit() The function calls flowi4_init_output() to initialize an IPv4 flow key with which it then performs a FIB lookup using ip_route_output_flow(). The 'tos' variable with which the TOS value in the IPv4 flow key (flowi4_tos) is initialized contains the full DS field. Unmask the upper DSCP bits so that in the future the FIB lookup could be performed according to the full DSCP value. Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/ipv6/sit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 83b195f09561d9..3b2eed7fc7659e 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -51,6 +51,7 @@ #include #include #include +#include /* This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c @@ -935,8 +936,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, } flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark, - RT_TOS(tos), RT_SCOPE_UNIVERSE, IPPROTO_IPV6, - 0, dst, tiph->saddr, 0, 0, + tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, + IPPROTO_IPV6, 0, dst, tiph->saddr, 0, 0, sock_net_uid(tunnel->net, NULL)); rt = dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr); From 939cd1abf080c629552a9c5e6db4c0509d13e4c7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 29 Aug 2024 09:54:57 +0300 Subject: [PATCH 24/29] ipvlan: Unmask upper DSCP bits in ipvlan_process_v4_outbound() Unmask the upper DSCP bits when calling ip_route_output_flow() so that in the future it could perform the FIB lookup according to the full DSCP value. Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: David S. 
Miller --- drivers/net/ipvlan/ipvlan_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index fef4eff7753a7a..b1afcb8740de12 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -2,6 +2,8 @@ /* Copyright (c) 2014 Mahesh Bandewar */ +#include <net/inet_dscp.h> + #include "ipvlan.h" static u32 ipvlan_jhash_secret __read_mostly; @@ -420,7 +422,7 @@ static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) int err, ret = NET_XMIT_DROP; struct flowi4 fl4 = { .flowi4_oif = dev->ifindex, - .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_tos = ip4h->tos & INET_DSCP_MASK, .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, .daddr = ip4h->daddr, From c5d8ffe29cf2873b62e4bc008ca48d045b6cde88 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 29 Aug 2024 09:54:58 +0300 Subject: [PATCH 25/29] vrf: Unmask upper DSCP bits in vrf_process_v4_outbound() Unmask the upper DSCP bits when calling ip_route_output_flow() so that in the future it could perform the FIB lookup according to the full DSCP value. Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: David S.
Miller --- drivers/net/vrf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 040f0bb36c0ea4..a900908eb24a6f 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -37,6 +37,7 @@ #include #include #include +#include #define DRV_NAME "vrf" #define DRV_VERSION "1.1" @@ -520,7 +521,7 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, /* needed to match OIF rule */ fl4.flowi4_l3mdev = vrf_dev->ifindex; fl4.flowi4_iif = LOOPBACK_IFINDEX; - fl4.flowi4_tos = RT_TOS(ip4h->tos); + fl4.flowi4_tos = ip4h->tos & INET_DSCP_MASK; fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = ip4h->protocol; fl4.daddr = ip4h->daddr; From 50033400fc3a4744ea3ef6b7ec6443c5ec15a797 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 29 Aug 2024 09:54:59 +0300 Subject: [PATCH 26/29] bpf: Unmask upper DSCP bits in __bpf_redirect_neigh_v4() Unmask the upper DSCP bits when calling ip_route_output_flow() so that in the future it could perform the FIB lookup according to the full DSCP value. Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index f09d875cc05318..8569cd2482eebf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2372,7 +2372,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, - .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_tos = ip4h->tos & INET_DSCP_MASK, .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, From 4ebe78e15b95e8baaf7c3686694b59319b215f38 Mon Sep 17 00:00:00 2001 From: Srujana Challa Date: Thu, 29 Aug 2024 13:39:33 +0530 Subject: [PATCH 27/29] octeontx2-af: use dynamic interrupt vectors for CN10K This patch updates the driver to use a dynamic number of vectors instead of a hard-coded value. This change accommodates the CN10KB, which has 2 vectors, unlike the previously supported chips that have 3 vectors. Signed-off-by: Srujana Challa Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- .../net/ethernet/marvell/octeontx2/af/mbox.h | 5 +- .../ethernet/marvell/octeontx2/af/rvu_cpt.c | 89 ++++++++++++++++--- .../marvell/octeontx2/af/rvu_struct.h | 6 +- 3 files changed, 80 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index ed2160cc5acb40..6ea2f3071fe8f7 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -1856,8 +1856,9 @@ struct cpt_flt_eng_info_req { struct cpt_flt_eng_info_rsp { struct mbox_msghdr hdr; - u64 flt_eng_map[CPT_10K_AF_INT_VEC_RVU]; - u64 rcvrd_eng_map[CPT_10K_AF_INT_VEC_RVU]; +#define CPT_AF_MAX_FLT_INT_VECS 3 + u64 flt_eng_map[CPT_AF_MAX_FLT_INT_VECS]; + u64 rcvrd_eng_map[CPT_AF_MAX_FLT_INT_VECS]; u64 rsvd; }; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c index daf4b951e90591..cd5b21cb04271a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c @@ -19,6 +19,9 @@ /* Length of initial context fetch in 128 byte words */ #define CPT_CTX_ILEN 1ULL +/* Interrupt vector count of CPT RVU and RAS interrupts */ +#define CPT_10K_AF_RVU_RAS_INT_VEC_CNT 2 + #define cpt_get_eng_sts(e_min, e_max, rsp, etype) \ ({ \ u64 free_sts = 0, busy_sts = 0; \ @@ -37,6 +40,41 @@ (_rsp)->free_sts_##etype = free_sts; \ }) +#define MAX_AE GENMASK_ULL(47, 32) +#define MAX_IE GENMASK_ULL(31, 16) +#define MAX_SE GENMASK_ULL(15, 0) + +static u16 cpt_max_engines_get(struct rvu *rvu) +{ + u16 max_ses, max_ies, max_aes; + u64 reg; + + reg = rvu_read64(rvu, BLKADDR_CPT0, CPT_AF_CONSTANTS1); + max_ses = FIELD_GET(MAX_SE, reg); + max_ies = FIELD_GET(MAX_IE, reg); + max_aes = FIELD_GET(MAX_AE, reg); + + return max_ses + max_ies + max_aes; +} + +/* The number of flt interrupt vectors depends on the number of engines that the + * chip has.
Each flt vector represents 64 engines. + */ +static int cpt_10k_flt_nvecs_get(struct rvu *rvu, u16 max_engs) +{ + int flt_vecs; + + flt_vecs = DIV_ROUND_UP(max_engs, 64); + + if (flt_vecs > CPT_10K_AF_INT_VEC_FLT_MAX) { + dev_warn_once(rvu->dev, "flt_vecs:%d exceeds the max vectors:%d\n", + flt_vecs, CPT_10K_AF_INT_VEC_FLT_MAX); + flt_vecs = CPT_10K_AF_INT_VEC_FLT_MAX; + } + + return flt_vecs; +} + static irqreturn_t cpt_af_flt_intr_handler(int vec, void *ptr) { struct rvu_block *block = ptr; @@ -150,17 +188,26 @@ static void cpt_10k_unregister_interrupts(struct rvu_block *block, int off) { struct rvu *rvu = block->rvu; int blkaddr = block->addr; - int i; + int i, flt_vecs; + u16 max_engs; + u8 nr; + + max_engs = cpt_max_engines_get(rvu); + flt_vecs = cpt_10k_flt_nvecs_get(rvu, max_engs); /* Disable all CPT AF interrupts */ - rvu_write64(rvu, blkaddr, CPT_AF_FLTX_INT_ENA_W1C(0), ~0ULL); - rvu_write64(rvu, blkaddr, CPT_AF_FLTX_INT_ENA_W1C(1), ~0ULL); - rvu_write64(rvu, blkaddr, CPT_AF_FLTX_INT_ENA_W1C(2), 0xFFFF); + for (i = CPT_10K_AF_INT_VEC_FLT0; i < flt_vecs; i++) { + nr = (max_engs > 64) ? 64 : max_engs; + max_engs -= nr; + rvu_write64(rvu, blkaddr, CPT_AF_FLTX_INT_ENA_W1C(i), + INTR_MASK(nr)); + } rvu_write64(rvu, blkaddr, CPT_AF_RVU_INT_ENA_W1C, 0x1); rvu_write64(rvu, blkaddr, CPT_AF_RAS_INT_ENA_W1C, 0x1); - for (i = 0; i < CPT_10K_AF_INT_VEC_CNT; i++) + /* CPT AF interrupt vectors are flt_int, rvu_int and ras_int. 
*/ + for (i = 0; i < flt_vecs + CPT_10K_AF_RVU_RAS_INT_VEC_CNT; i++) if (rvu->irq_allocated[off + i]) { free_irq(pci_irq_vector(rvu->pdev, off + i), block); rvu->irq_allocated[off + i] = false; @@ -206,12 +253,18 @@ void rvu_cpt_unregister_interrupts(struct rvu *rvu) static int cpt_10k_register_interrupts(struct rvu_block *block, int off) { + int rvu_intr_vec, ras_intr_vec; struct rvu *rvu = block->rvu; int blkaddr = block->addr; irq_handler_t flt_fn; - int i, ret; + int i, ret, flt_vecs; + u16 max_engs; + u8 nr; + + max_engs = cpt_max_engines_get(rvu); + flt_vecs = cpt_10k_flt_nvecs_get(rvu, max_engs); - for (i = CPT_10K_AF_INT_VEC_FLT0; i < CPT_10K_AF_INT_VEC_RVU; i++) { + for (i = CPT_10K_AF_INT_VEC_FLT0; i < flt_vecs; i++) { sprintf(&rvu->irq_name[(off + i) * NAME_SIZE], "CPTAF FLT%d", i); switch (i) { @@ -229,20 +282,24 @@ static int cpt_10k_register_interrupts(struct rvu_block *block, int off) flt_fn, &rvu->irq_name[(off + i) * NAME_SIZE]); if (ret) goto err; - if (i == CPT_10K_AF_INT_VEC_FLT2) - rvu_write64(rvu, blkaddr, CPT_AF_FLTX_INT_ENA_W1S(i), 0xFFFF); - else - rvu_write64(rvu, blkaddr, CPT_AF_FLTX_INT_ENA_W1S(i), ~0ULL); + + nr = (max_engs > 64) ? 
64 : max_engs; + max_engs -= nr; + rvu_write64(rvu, blkaddr, CPT_AF_FLTX_INT_ENA_W1S(i), + INTR_MASK(nr)); } - ret = rvu_cpt_do_register_interrupt(block, off + CPT_10K_AF_INT_VEC_RVU, + rvu_intr_vec = flt_vecs; + ras_intr_vec = rvu_intr_vec + 1; + + ret = rvu_cpt_do_register_interrupt(block, off + rvu_intr_vec, rvu_cpt_af_rvu_intr_handler, "CPTAF RVU"); if (ret) goto err; rvu_write64(rvu, blkaddr, CPT_AF_RVU_INT_ENA_W1S, 0x1); - ret = rvu_cpt_do_register_interrupt(block, off + CPT_10K_AF_INT_VEC_RAS, + ret = rvu_cpt_do_register_interrupt(block, off + ras_intr_vec, rvu_cpt_af_ras_intr_handler, "CPTAF RAS"); if (ret) @@ -921,13 +978,17 @@ int rvu_mbox_handler_cpt_flt_eng_info(struct rvu *rvu, struct cpt_flt_eng_info_r struct rvu_block *block; unsigned long flags; int blkaddr, vec; + int flt_vecs; + u16 max_engs; blkaddr = validate_and_get_cpt_blkaddr(req->blkaddr); if (blkaddr < 0) return blkaddr; block = &rvu->hw->block[blkaddr]; - for (vec = 0; vec < CPT_10K_AF_INT_VEC_RVU; vec++) { + max_engs = cpt_max_engines_get(rvu); + flt_vecs = cpt_10k_flt_nvecs_get(rvu, max_engs); + for (vec = 0; vec < flt_vecs; vec++) { spin_lock_irqsave(&rvu->cpt_intr_lock, flags); rsp->flt_eng_map[vec] = block->cpt_flt_eng_map[vec]; rsp->rcvrd_eng_map[vec] = block->cpt_rcvrd_eng_map[vec]; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h index 5ef406c7e8a44a..fc8da209065705 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h @@ -71,13 +71,11 @@ enum cpt_af_int_vec_e { CPT_AF_INT_VEC_CNT = 0x4, }; -enum cpt_10k_af_int_vec_e { +enum cpt_cn10k_flt_int_vec_e { CPT_10K_AF_INT_VEC_FLT0 = 0x0, CPT_10K_AF_INT_VEC_FLT1 = 0x1, CPT_10K_AF_INT_VEC_FLT2 = 0x2, - CPT_10K_AF_INT_VEC_RVU = 0x3, - CPT_10K_AF_INT_VEC_RAS = 0x4, - CPT_10K_AF_INT_VEC_CNT = 0x5, + CPT_10K_AF_INT_VEC_FLT_MAX = 0x3, }; /* NPA Admin function Interrupt Vector Enumeration */ From 
1652623291c50a9ec4db3c416b7d01701b4012ff Mon Sep 17 00:00:00 2001 From: Srujana Challa Date: Thu, 29 Aug 2024 13:39:34 +0530 Subject: [PATCH 28/29] octeontx2-af: avoid RXC register access for CN10KB This patch modifies the driver to prevent access to RXC hardware registers on the CN10KB, as RXC is not available on this chip. Signed-off-by: Srujana Challa Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu.h | 10 ++++++++++ .../net/ethernet/marvell/octeontx2/af/rvu_cpt.c | 17 ++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 03ee93fd9e9498..64c9c9ee000d65 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -400,6 +400,7 @@ struct hw_cap { bool nix_multiple_dwrr_mtu; /* Multiple DWRR_MTU to choose from */ bool npc_hash_extract; /* Hash extract enabled ? */ bool npc_exact_match_enabled; /* Exact match supported ? 
*/ + bool cpt_rxc; /* Is CPT-RXC supported */ }; struct rvu_hwinfo { @@ -690,6 +691,15 @@ static inline bool is_cnf10ka_a0(struct rvu *rvu) return false; } +static inline bool is_cn10kb(struct rvu *rvu) +{ + struct pci_dev *pdev = rvu->pdev; + + if (pdev->subsystem_device == PCI_SUBSYS_DEVID_CN10K_B) + return true; + return false; +} + static inline bool is_rvu_npc_hash_extract_en(struct rvu *rvu) { u64 npc_const3; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c index cd5b21cb04271a..d44614a63a7bba 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c @@ -789,6 +789,8 @@ int rvu_mbox_handler_cpt_rd_wr_register(struct rvu *rvu, static void get_ctx_pc(struct rvu *rvu, struct cpt_sts_rsp *rsp, int blkaddr) { + struct rvu_hwinfo *hw = rvu->hw; + if (is_rvu_otx2(rvu)) return; @@ -812,14 +814,16 @@ static void get_ctx_pc(struct rvu *rvu, struct cpt_sts_rsp *rsp, int blkaddr) rsp->ctx_err = rvu_read64(rvu, blkaddr, CPT_AF_CTX_ERR); rsp->ctx_enc_id = rvu_read64(rvu, blkaddr, CPT_AF_CTX_ENC_ID); rsp->ctx_flush_timer = rvu_read64(rvu, blkaddr, CPT_AF_CTX_FLUSH_TIMER); + rsp->x2p_link_cfg0 = rvu_read64(rvu, blkaddr, CPT_AF_X2PX_LINK_CFG(0)); + rsp->x2p_link_cfg1 = rvu_read64(rvu, blkaddr, CPT_AF_X2PX_LINK_CFG(1)); + if (!hw->cap.cpt_rxc) + return; rsp->rxc_time = rvu_read64(rvu, blkaddr, CPT_AF_RXC_TIME); rsp->rxc_time_cfg = rvu_read64(rvu, blkaddr, CPT_AF_RXC_TIME_CFG); rsp->rxc_active_sts = rvu_read64(rvu, blkaddr, CPT_AF_RXC_ACTIVE_STS); rsp->rxc_zombie_sts = rvu_read64(rvu, blkaddr, CPT_AF_RXC_ZOMBIE_STS); rsp->rxc_dfrg = rvu_read64(rvu, blkaddr, CPT_AF_RXC_DFRG); - rsp->x2p_link_cfg0 = rvu_read64(rvu, blkaddr, CPT_AF_X2PX_LINK_CFG(0)); - rsp->x2p_link_cfg1 = rvu_read64(rvu, blkaddr, CPT_AF_X2PX_LINK_CFG(1)); } static void get_eng_sts(struct rvu *rvu, struct cpt_sts_rsp *rsp, int blkaddr) @@ -1004,10 +1008,11 @@ int 
rvu_mbox_handler_cpt_flt_eng_info(struct rvu *rvu, struct cpt_flt_eng_info_r static void cpt_rxc_teardown(struct rvu *rvu, int blkaddr) { struct cpt_rxc_time_cfg_req req, prev; + struct rvu_hwinfo *hw = rvu->hw; int timeout = 2000; u64 reg; - if (is_rvu_otx2(rvu)) + if (!hw->cap.cpt_rxc) return; /* Set time limit to minimum values, so that rxc entries will be @@ -1282,8 +1287,14 @@ int rvu_cpt_ctx_flush(struct rvu *rvu, u16 pcifunc) int rvu_cpt_init(struct rvu *rvu) { + struct rvu_hwinfo *hw = rvu->hw; + /* Retrieve CPT PF number */ rvu->cpt_pf_num = get_cpt_pf_num(rvu); + if (is_block_implemented(rvu->hw, BLKADDR_CPT0) && !is_rvu_otx2(rvu) && + !is_cn10kb(rvu)) + hw->cap.cpt_rxc = true; + spin_lock_init(&rvu->cpt_intr_lock); return 0; From 5da8de8cb3e3b01fd838536c75a36b667eca128b Mon Sep 17 00:00:00 2001 From: Srujana Challa Date: Thu, 29 Aug 2024 13:39:35 +0530 Subject: [PATCH 29/29] octeontx2-af: configure default CPT credits for CN10KA B0 The maximum CPT credits that RXC can use are now configurable on CN10KA B0 through a hardware CSR. This patch sets the default value to optimize peak performance, aligning it with other chip versions. Signed-off-by: Srujana Challa Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- .../net/ethernet/marvell/octeontx2/af/rvu.h | 20 +++++++++++++++++++ .../ethernet/marvell/octeontx2/af/rvu_cpt.c | 18 +++++++++++++++++ .../ethernet/marvell/octeontx2/af/rvu_reg.h | 1 + 3 files changed, 39 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 64c9c9ee000d65..43b1d83686d164 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -691,6 +691,26 @@ static inline bool is_cnf10ka_a0(struct rvu *rvu) return false; } +static inline bool is_cn10ka_a0(struct rvu *rvu) +{ + struct pci_dev *pdev = rvu->pdev; + + if (pdev->subsystem_device == PCI_SUBSYS_DEVID_CN10K_A && + (pdev->revision & 0x0F) == 0x0) + return true; + return false; +} + +static inline bool is_cn10ka_a1(struct rvu *rvu) +{ + struct pci_dev *pdev = rvu->pdev; + + if (pdev->subsystem_device == PCI_SUBSYS_DEVID_CN10K_A && + (pdev->revision & 0x0F) == 0x1) + return true; + return false; +} + static inline bool is_cn10kb(struct rvu *rvu) { struct pci_dev *pdev = rvu->pdev; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c index d44614a63a7bba..3c5bbaf12e594c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c @@ -22,6 +22,9 @@ /* Interrupt vector count of CPT RVU and RAS interrupts */ #define CPT_10K_AF_RVU_RAS_INT_VEC_CNT 2 +/* Default CPT_AF_RXC_CFG1:max_rxc_icb_cnt */ +#define CPT_DFLT_MAX_RXC_ICB_CNT 0xC0ULL + #define cpt_get_eng_sts(e_min, e_max, rsp, etype) \ ({ \ u64 free_sts = 0, busy_sts = 0; \ @@ -737,6 +740,7 @@ static bool validate_and_update_reg_offset(struct rvu *rvu, case CPT_AF_BLK_RST: case CPT_AF_CONSTANTS1: case CPT_AF_CTX_FLUSH_TIMER: + case CPT_AF_RXC_CFG1: return true; } @@ -1285,9 +1289,12 @@ int rvu_cpt_ctx_flush(struct rvu *rvu, u16 pcifunc) return 0; } +#define MAX_RXC_ICB_CNT 
GENMASK_ULL(40, 32) + int rvu_cpt_init(struct rvu *rvu) { struct rvu_hwinfo *hw = rvu->hw; + u64 reg_val; /* Retrieve CPT PF number */ rvu->cpt_pf_num = get_cpt_pf_num(rvu); @@ -1295,6 +1302,17 @@ int rvu_cpt_init(struct rvu *rvu) !is_cn10kb(rvu)) hw->cap.cpt_rxc = true; + if (hw->cap.cpt_rxc && !is_cn10ka_a0(rvu) && !is_cn10ka_a1(rvu)) { + /* Set CPT_AF_RXC_CFG1:max_rxc_icb_cnt to 0xc0 to not affect + * inline inbound peak performance + */ + reg_val = rvu_read64(rvu, BLKADDR_CPT0, CPT_AF_RXC_CFG1); + reg_val &= ~MAX_RXC_ICB_CNT; + reg_val |= FIELD_PREP(MAX_RXC_ICB_CNT, + CPT_DFLT_MAX_RXC_ICB_CNT); + rvu_write64(rvu, BLKADDR_CPT0, CPT_AF_RXC_CFG1, reg_val); + } + spin_lock_init(&rvu->cpt_intr_lock); return 0; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h index d56be5fb7eb4a2..2b299fa8515913 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h @@ -545,6 +545,7 @@ #define CPT_AF_CTX_PSH_PC (0x49450ull) #define CPT_AF_CTX_PSH_LATENCY_PC (0x49458ull) #define CPT_AF_CTX_CAM_DATA(a) (0x49800ull | (u64)(a) << 3) +#define CPT_AF_RXC_CFG1 (0x50000ull) #define CPT_AF_RXC_TIME (0x50010ull) #define CPT_AF_RXC_TIME_CFG (0x50018ull) #define CPT_AF_RXC_DFRG (0x50020ull)