On 4/19/2024 2:03 AM, Jiri Slaby wrote:
On 19. 04. 24, 10:44, Jiri Slaby wrote:
On 08. 04. 24, 14:56, Greg Kroah-Hartman wrote:
6.8-stable review patch. If anyone has any objections, please let me know.
From: Vitaly Lifshits vitaly.lifshits@intel.com
commit 6dbdd4de0362c37e54e8b049781402e5a409e7d0 upstream.
On some Meteor Lake systems accessing the PHY via the MDIO interface may result in an MDI error. This issue happens sporadically and in most cases a second access to the PHY via the MDIO interface results in success.
As a workaround, introduce a retry counter which is set to 3 on Meteor Lake systems. The driver will only return an error if 3 consecutive PHY access attempts fail. The retry mechanism is disabled in specific flows, where MDI errors are expected.
...
--- a/drivers/net/ethernet/intel/e1000e/phy.c +++ b/drivers/net/ethernet/intel/e1000e/phy.c @@ -107,6 +107,16 @@ s32 e1000e_phy_reset_dsp(struct e1000_hw return e1e_wphy(hw, M88E1000_PHY_GEN_CONTROL, 0); } +void e1000e_disable_phy_retry(struct e1000_hw *hw) +{ + hw->phy.retry_enabled = false; +}
+void e1000e_enable_phy_retry(struct e1000_hw *hw) +{ + hw->phy.retry_enabled = true; +}
/** * e1000e_read_phy_reg_mdic - Read MDI control register * @hw: pointer to the HW structure @@ -118,55 +128,73 @@ s32 e1000e_phy_reset_dsp(struct e1000_hw **/ s32 e1000e_read_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 *data) { + u32 i, mdic = 0, retry_counter, retry_max; struct e1000_phy_info *phy = &hw->phy; - u32 i, mdic = 0; + bool success; if (offset > MAX_PHY_REG_ADDRESS) { e_dbg("PHY Address %d is out of range\n", offset); return -E1000_ERR_PARAM; } + retry_max = phy->retry_enabled ? phy->retry_count : 0;
/* Set up Op-code, Phy Address, and register offset in the MDI * Control register. The MAC will take care of interfacing with the * PHY to retrieve the desired data. */ - mdic = ((offset << E1000_MDIC_REG_SHIFT) | - (phy->addr << E1000_MDIC_PHY_SHIFT) | - (E1000_MDIC_OP_READ));
- ew32(MDIC, mdic);
- /* Poll the ready bit to see if the MDI read completed - * Increasing the time out as testing showed failures with - * the lower time out - */ - for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { - udelay(50); - mdic = er32(MDIC); - if (mdic & E1000_MDIC_READY) - break; - } - if (!(mdic & E1000_MDIC_READY)) { - e_dbg("MDI Read PHY Reg Address %d did not complete\n", offset); - return -E1000_ERR_PHY; - } - if (mdic & E1000_MDIC_ERROR) { - e_dbg("MDI Read PHY Reg Address %d Error\n", offset); - return -E1000_ERR_PHY; - } - if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { - e_dbg("MDI Read offset error - requested %d, returned %d\n", - offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); - return -E1000_ERR_PHY; + for (retry_counter = 0; retry_counter <= retry_max; retry_counter++) { + success = true;
+ mdic = ((offset << E1000_MDIC_REG_SHIFT) | + (phy->addr << E1000_MDIC_PHY_SHIFT) | + (E1000_MDIC_OP_READ));
+ ew32(MDIC, mdic);
+ /* Poll the ready bit to see if the MDI read completed + * Increasing the time out as testing showed failures with + * the lower time out + */ + for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { + usleep_range(50, 60);
This crashes the kernel as a spinlock is held upper in the call stack in e1000_watchdog_task(): spin_lock(&adapter->stats64_lock); e1000e_update_stats(adapter); -> e1000e_update_phy_stats() -> e1000e_read_phy_reg_mdic() -> usleep_range() ----> Boom.
It was reported to our bugzilla: https://bugzilla.suse.com/show_bug.cgi?id=1223109
I believe, the mainline has the same bug.
Any ideas?
Obviously change the usleeps back to udelays? Or maybe only when retry_enabled is set?
Hi Jiri,
Should be the former. Could you give this patch a try? https://lore.kernel.org/intel-wired-lan/20240417190320.3159360-1-vitaly.lifs...
Thanks, Tony