As reported in https://bugzilla.kernel.org/206217 , raw_violation_fixup is causing more harm than good in some common use-cases.
This patch is a partial revert of the 191cd6fb5 commit: "PCI: tegra: Add SW fixup for RAW violations" that was first introduced in 5.3-rc1 kernel. This fix the following regression since then.
* Description: When both the NIC and MMC are used one can see the following message:
NETDEV WATCHDOG: enp1s0 (r8169): transmit queue 0 timed out
and
pcieport 0000:00:02.0: AER: Uncorrected (Non-Fatal) error received: 0000:01:00.0 r8169 0000:01:00.0: AER: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID) r8169 0000:01:00.0: AER: device [10ec:8168] error status/mask=00004000/00400000 r8169 0000:01:00.0: AER: [14] CmpltTO (First) r8169 0000:01:00.0: AER: can't recover (no error_detected callback) pcieport 0000:00:02.0: AER: device recovery failed
After that, the ethernet NIC isn't functional anymore even after reloading the r8169 module. After a reboot, this is reproducible by copying a large file over the NIC to the MMC.
For some reasons this cannot be reproduced when the same file is copied to a tmpfs.
* Little background on the fixup, by Manikanta Maddireddy: "In the internal testing with dGPU on Tegra124, CmplTO is reported by dGPU. This happened because FIFO queue in AFI(AXI to PCIe) module get full by upstream posted writes. Back to back upstream writes interleaved with infrequent reads, triggers RAW violation and CmpltTO. This is fixed by reducing the posted write credits and by changing updateFC timer frequency. These settings are fixed after stress test.
In the current case, RTL NIC is also reporting CmplTO. These settings seems to be aggravating the issue instead of fixing it."
v1: first non-RFC version - Disable raw_violation_fixup and fully remove unused code and macros
Signed-off-by: Nicolas Chauvet kwizart@gmail.com Reviewed-by: Manikanta Maddireddy mmaddireddy@nvidia.com Cc: stable@vger.kernel.org # 5.4.x --- drivers/pci/controller/pci-tegra.c | 32 ------------------------------ 1 file changed, 32 deletions(-)
diff --git a/drivers/pci/controller/pci-tegra.c b/drivers/pci/controller/pci-tegra.c index 235b456698fc..b532d5082fb6 100644 --- a/drivers/pci/controller/pci-tegra.c +++ b/drivers/pci/controller/pci-tegra.c @@ -181,13 +181,6 @@
#define AFI_PEXBIAS_CTRL_0 0x168
-#define RP_PRIV_XP_DL 0x00000494 -#define RP_PRIV_XP_DL_GEN2_UPD_FC_TSHOLD (0x1ff << 1) - -#define RP_RX_HDR_LIMIT 0x00000e00 -#define RP_RX_HDR_LIMIT_PW_MASK (0xff << 8) -#define RP_RX_HDR_LIMIT_PW (0x0e << 8) - #define RP_ECTL_2_R1 0x00000e84 #define RP_ECTL_2_R1_RX_CTLE_1C_MASK 0xffff
@@ -323,7 +316,6 @@ struct tegra_pcie_soc { bool program_uphy; bool update_clamp_threshold; bool program_deskew_time; - bool raw_violation_fixup; bool update_fc_timer; bool has_cache_bars; struct { @@ -659,23 +651,6 @@ static void tegra_pcie_apply_sw_fixup(struct tegra_pcie_port *port) writel(value, port->base + RP_VEND_CTL0); }
- /* Fixup for read after write violation. */ - if (soc->raw_violation_fixup) { - value = readl(port->base + RP_RX_HDR_LIMIT); - value &= ~RP_RX_HDR_LIMIT_PW_MASK; - value |= RP_RX_HDR_LIMIT_PW; - writel(value, port->base + RP_RX_HDR_LIMIT); - - value = readl(port->base + RP_PRIV_XP_DL); - value |= RP_PRIV_XP_DL_GEN2_UPD_FC_TSHOLD; - writel(value, port->base + RP_PRIV_XP_DL); - - value = readl(port->base + RP_VEND_XP); - value &= ~RP_VEND_XP_UPDATE_FC_THRESHOLD_MASK; - value |= soc->update_fc_threshold; - writel(value, port->base + RP_VEND_XP); - } - if (soc->update_fc_timer) { value = readl(port->base + RP_VEND_XP); value &= ~RP_VEND_XP_UPDATE_FC_THRESHOLD_MASK; @@ -2416,7 +2391,6 @@ static const struct tegra_pcie_soc tegra20_pcie = { .program_uphy = true, .update_clamp_threshold = false, .program_deskew_time = false, - .raw_violation_fixup = false, .update_fc_timer = false, .has_cache_bars = true, .ectl.enable = false, @@ -2446,7 +2420,6 @@ static const struct tegra_pcie_soc tegra30_pcie = { .program_uphy = true, .update_clamp_threshold = false, .program_deskew_time = false, - .raw_violation_fixup = false, .update_fc_timer = false, .has_cache_bars = false, .ectl.enable = false, @@ -2459,8 +2432,6 @@ static const struct tegra_pcie_soc tegra124_pcie = { .pads_pll_ctl = PADS_PLL_CTL_TEGRA30, .tx_ref_sel = PADS_PLL_CTL_TXCLKREF_BUF_EN, .pads_refclk_cfg0 = 0x44ac44ac, - /* FC threshold is bit[25:18] */ - .update_fc_threshold = 0x03fc0000, .has_pex_clkreq_en = true, .has_pex_bias_ctrl = true, .has_intr_prsnt_sense = true, @@ -2470,7 +2441,6 @@ static const struct tegra_pcie_soc tegra124_pcie = { .program_uphy = true, .update_clamp_threshold = true, .program_deskew_time = false, - .raw_violation_fixup = true, .update_fc_timer = false, .has_cache_bars = false, .ectl.enable = false, @@ -2494,7 +2464,6 @@ static const struct tegra_pcie_soc tegra210_pcie = { .program_uphy = true, .update_clamp_threshold = true, .program_deskew_time = true, - .raw_violation_fixup = false, .update_fc_timer = true, .has_cache_bars = false, .ectl = { @@ -2536,7 +2505,6 @@ static const struct tegra_pcie_soc tegra186_pcie = { .program_uphy = false, .update_clamp_threshold = false, .program_deskew_time = false, - .raw_violation_fixup = false, .update_fc_timer = false, .has_cache_bars = false, .ectl.enable = false,
Please update subject to follow the convention ("git log --online drivers/pci/controller/pci-tegra.c") to see it:
PCI: tegra: Revert tegra124 raw_violation_fixup
On Fri, Jul 17, 2020 at 11:35:10PM +0200, Nicolas Chauvet wrote:
As reported in https://bugzilla.kernel.org/206217 , raw_violation_fixup is causing more harm than good in some common use-cases.
This patch is a partial revert of the 191cd6fb5 commit: "PCI: tegra: Add SW fixup for RAW violations"
Usual style is: 191cd6fb5d2c ("PCI: tegra: Add SW fixup for RAW violations")
that was first introduced in 5.3-rc1 kernel. This fix the following regression since then.
- Description:
When both the NIC and MMC are used one can see the following message:
NETDEV WATCHDOG: enp1s0 (r8169): transmit queue 0 timed out
and
pcieport 0000:00:02.0: AER: Uncorrected (Non-Fatal) error received: 0000:01:00.0 r8169 0000:01:00.0: AER: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID) r8169 0000:01:00.0: AER: device [10ec:8168] error status/mask=00004000/00400000 r8169 0000:01:00.0: AER: [14] CmpltTO (First) r8169 0000:01:00.0: AER: can't recover (no error_detected callback) pcieport 0000:00:02.0: AER: device recovery failed
Indent the quoted text (messages) two spaces so it's distinct from the prose.
After that, the ethernet NIC isn't functional anymore even after reloading the r8169 module. After a reboot, this is reproducible by copying a large file over the NIC to the MMC.
This looks like two paragraphs; if so, put a blank line between them. Otherwise wrap them so they fill the line. It's hard to read when there are line breaks that look unnecessary.
For some reasons this cannot be reproduced when the same file is copied to a tmpfs.
- Little background on the fixup, by Manikanta Maddireddy: "In the internal testing with dGPU on Tegra124, CmplTO is reported by
dGPU. This happened because FIFO queue in AFI(AXI to PCIe) module get full by upstream posted writes. Back to back upstream writes interleaved with infrequent reads, triggers RAW violation and CmpltTO. This is fixed by reducing the posted write credits and by changing updateFC timer frequency. These settings are fixed after stress test.
In the current case, RTL NIC is also reporting CmplTO. These settings seems to be aggravating the issue instead of fixing it."
v1: first non-RFC version
- Disable raw_violation_fixup and fully remove unused code and macros
This version history can go after the "---" so it doesn't get included in the final commit log. It's nice if your subject line includes "[PATCH v2]" or whatever is appropriate.
Add this just before your Signed-off-by:
Fixes: 191cd6fb5d2c ("PCI: tegra: Add SW fixup for RAW violations")
Signed-off-by: Nicolas Chauvet kwizart@gmail.com Reviewed-by: Manikanta Maddireddy mmaddireddy@nvidia.com Cc: stable@vger.kernel.org # 5.4.x
No "<>" needed around stable@vger.kernel.org
You need not (and shouldn't) cc: stable@vger.kernel.org when you post this to the list. The stable tag here in the commit log is sufficient. Documentation/process/stable-kernel-rules.rst for more details.
Is v5.4.x really the oldest kernel that should get this fix? It looks like 191cd6fb5d2c appeared in v5.3.
Le ven. 17 juil. 2020 à 23:53, Bjorn Helgaas helgaas@kernel.org a écrit :
Thanks for the quick review. I've addressed all comments and I've resubmitted a v2 to https://www.spinics.net/lists/linux-pci/msg96863.html Unfortunately I've missed to modify the [Patch v2] tag. I hope this is fine. Let me know if I need to resend.
Is v5.4.x really the oldest kernel that should get this fix? It looks like 191cd6fb5d2c appeared in v5.3.
The commit was introduced in 5.3 indeed. I've added 5.4.x since it's the last maintained kernel from long term branches. Now I'm using the Fixes: tag, I've dropped the version of the kernel for stable as it seems duplicate and less accurate.
Thanks.
On Mon, Jul 20, 2020 at 09:02:59AM +0200, Nicolas Chauvet wrote:
Le ven. 17 juil. 2020 à 23:53, Bjorn Helgaas helgaas@kernel.org a écrit :
Thanks for the quick review. I've addressed all comments and I've resubmitted a v2 to https://www.spinics.net/lists/linux-pci/msg96863.html Unfortunately I've missed to modify the [Patch v2] tag. I hope this is fine. Let me know if I need to resend.
Is v5.4.x really the oldest kernel that should get this fix? It looks like 191cd6fb5d2c appeared in v5.3.
The commit was introduced in 5.3 indeed. I've added 5.4.x since it's the last maintained kernel from long term branches. Now I'm using the Fixes: tag, I've dropped the version of the kernel for stable as it seems duplicate and less accurate.
I have applied this patch to pci/tegra, thanks.
Lorenzo
linux-stable-mirror@lists.linaro.org