As explained by a comment in <linux/u64_stats_sync.h>, write side of struct u64_stats_sync must ensure mutual exclusion, or one seqcount update could be lost on 32-bit platforms, thus blocking readers forever. Such lockups have been observed in real world after stmmac_xmit() on one CPU raced with stmmac_napi_poll_tx() on another CPU.
To fix the issue without introducing a new lock, split the statistics into three parts:
1. fields updated only under the tx queue lock, 2. fields updated only during NAPI poll, 3. fields updated only from interrupt context.
Updates to fields in the first two groups are already serialized through other locks. It is sufficient to split the existing struct u64_stats_sync so that each group has its own.
Note that tx_set_ic_bit is updated from both contexts. Split this counter so that each context gets its own, and calculate their sum to get the total value in stmmac_get_ethtool_stats().
For the third group, multiple interrupts may be processed by different CPUs at the same time, but interrupts on the same CPU will not nest. Move fields from this group to a newly created per-cpu struct stmmac_pcpu_stats.
Fixes: 133466c3bbe1 ("net: stmmac: use per-queue 64 bit statistics where necessary") Link: https://lore.kernel.org/netdev/Za173PhviYg-1qIn@torres.zugschlus.de/t/ Cc: stable@vger.kernel.org Signed-off-by: Petr Tesarik petr@tesarici.cz --- drivers/net/ethernet/stmicro/stmmac/common.h | 56 +++++--- .../net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 15 +- .../net/ethernet/stmicro/stmmac/dwmac4_lib.c | 15 +- .../net/ethernet/stmicro/stmmac/dwmac_lib.c | 15 +- .../ethernet/stmicro/stmmac/dwxgmac2_dma.c | 15 +- .../ethernet/stmicro/stmmac/stmmac_ethtool.c | 129 +++++++++++------ .../net/ethernet/stmicro/stmmac/stmmac_main.c | 133 +++++++++--------- 7 files changed, 221 insertions(+), 157 deletions(-)
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 721c1f8e892f..4aca20feb4b7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -59,28 +59,51 @@ #undef FRAME_FILTER_DEBUG /* #define FRAME_FILTER_DEBUG */
+struct stmmac_q_tx_stats { + u64_stats_t tx_bytes; + u64_stats_t tx_set_ic_bit; + u64_stats_t tx_tso_frames; + u64_stats_t tx_tso_nfrags; +}; + +struct stmmac_napi_tx_stats { + u64_stats_t tx_packets; + u64_stats_t tx_pkt_n; + u64_stats_t poll; + u64_stats_t tx_clean; + u64_stats_t tx_set_ic_bit; +}; + struct stmmac_txq_stats { - u64 tx_bytes; - u64 tx_packets; - u64 tx_pkt_n; - u64 tx_normal_irq_n; - u64 napi_poll; - u64 tx_clean; - u64 tx_set_ic_bit; - u64 tx_tso_frames; - u64 tx_tso_nfrags; - struct u64_stats_sync syncp; + /* Updates protected by tx queue lock. */ + struct u64_stats_sync q_syncp; + struct stmmac_q_tx_stats q; + + /* Updates protected by NAPI poll logic. */ + struct u64_stats_sync napi_syncp; + struct stmmac_napi_tx_stats napi; } ____cacheline_aligned_in_smp;
+struct stmmac_napi_rx_stats { + u64_stats_t rx_bytes; + u64_stats_t rx_packets; + u64_stats_t rx_pkt_n; + u64_stats_t poll; +}; + struct stmmac_rxq_stats { - u64 rx_bytes; - u64 rx_packets; - u64 rx_pkt_n; - u64 rx_normal_irq_n; - u64 napi_poll; - struct u64_stats_sync syncp; + /* Updates protected by NAPI poll logic. */ + struct u64_stats_sync napi_syncp; + struct stmmac_napi_rx_stats napi; } ____cacheline_aligned_in_smp;
+/* Updates on each CPU protected by not allowing nested irqs. */ +struct stmmac_pcpu_stats { + struct u64_stats_sync syncp; + u64_stats_t rx_normal_irq_n[MTL_MAX_TX_QUEUES]; + u64_stats_t tx_normal_irq_n[MTL_MAX_RX_QUEUES]; +}; + /* Extra statistic and debug information exposed by ethtool */ struct stmmac_extra_stats { /* Transmit errors */ @@ -205,6 +228,7 @@ struct stmmac_extra_stats { /* per queue statistics */ struct stmmac_txq_stats txq_stats[MTL_MAX_TX_QUEUES]; struct stmmac_rxq_stats rxq_stats[MTL_MAX_RX_QUEUES]; + struct stmmac_pcpu_stats __percpu *pcpu_stats; unsigned long rx_dropped; unsigned long rx_errors; unsigned long tx_dropped; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 137741b94122..b21d99faa2d0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -441,8 +441,7 @@ static int sun8i_dwmac_dma_interrupt(struct stmmac_priv *priv, struct stmmac_extra_stats *x, u32 chan, u32 dir) { - struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[chan]; - struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[chan]; + struct stmmac_pcpu_stats *stats = this_cpu_ptr(priv->xstats.pcpu_stats); int ret = 0; u32 v;
@@ -455,9 +454,9 @@ static int sun8i_dwmac_dma_interrupt(struct stmmac_priv *priv,
if (v & EMAC_TX_INT) { ret |= handle_tx; - u64_stats_update_begin(&txq_stats->syncp); - txq_stats->tx_normal_irq_n++; - u64_stats_update_end(&txq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->tx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); }
if (v & EMAC_TX_DMA_STOP_INT) @@ -479,9 +478,9 @@ static int sun8i_dwmac_dma_interrupt(struct stmmac_priv *priv,
if (v & EMAC_RX_INT) { ret |= handle_rx; - u64_stats_update_begin(&rxq_stats->syncp); - rxq_stats->rx_normal_irq_n++; - u64_stats_update_end(&rxq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->rx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); }
if (v & EMAC_RX_BUF_UA_INT) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c index 9470d3fd2ded..0d185e54eb7e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c @@ -171,8 +171,7 @@ int dwmac4_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr, const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs; u32 intr_status = readl(ioaddr + DMA_CHAN_STATUS(dwmac4_addrs, chan)); u32 intr_en = readl(ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan)); - struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[chan]; - struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[chan]; + struct stmmac_pcpu_stats *stats = this_cpu_ptr(priv->xstats.pcpu_stats); int ret = 0;
if (dir == DMA_DIR_RX) @@ -201,15 +200,15 @@ int dwmac4_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr, } /* TX/RX NORMAL interrupts */ if (likely(intr_status & DMA_CHAN_STATUS_RI)) { - u64_stats_update_begin(&rxq_stats->syncp); - rxq_stats->rx_normal_irq_n++; - u64_stats_update_end(&rxq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->rx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); ret |= handle_rx; } if (likely(intr_status & DMA_CHAN_STATUS_TI)) { - u64_stats_update_begin(&txq_stats->syncp); - txq_stats->tx_normal_irq_n++; - u64_stats_update_end(&txq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->tx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); ret |= handle_tx; }
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c b/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c index 7907d62d3437..85e18f9a22f9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c @@ -162,8 +162,7 @@ static void show_rx_process_state(unsigned int status) int dwmac_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr, struct stmmac_extra_stats *x, u32 chan, u32 dir) { - struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[chan]; - struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[chan]; + struct stmmac_pcpu_stats *stats = this_cpu_ptr(priv->xstats.pcpu_stats); int ret = 0; /* read the status register (CSR5) */ u32 intr_status = readl(ioaddr + DMA_STATUS); @@ -215,16 +214,16 @@ int dwmac_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr, u32 value = readl(ioaddr + DMA_INTR_ENA); /* to schedule NAPI on real RIE event. */ if (likely(value & DMA_INTR_ENA_RIE)) { - u64_stats_update_begin(&rxq_stats->syncp); - rxq_stats->rx_normal_irq_n++; - u64_stats_update_end(&rxq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->rx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); ret |= handle_rx; } } if (likely(intr_status & DMA_STATUS_TI)) { - u64_stats_update_begin(&txq_stats->syncp); - txq_stats->tx_normal_irq_n++; - u64_stats_update_end(&txq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->tx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); ret |= handle_tx; } if (unlikely(intr_status & DMA_STATUS_ERI)) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c index 3cde695fec91..dd2ab6185c40 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c @@ -337,8 +337,7 @@ static int dwxgmac2_dma_interrupt(struct stmmac_priv *priv, struct stmmac_extra_stats *x, u32 
chan, u32 dir) { - struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[chan]; - struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[chan]; + struct stmmac_pcpu_stats *stats = this_cpu_ptr(priv->xstats.pcpu_stats); u32 intr_status = readl(ioaddr + XGMAC_DMA_CH_STATUS(chan)); u32 intr_en = readl(ioaddr + XGMAC_DMA_CH_INT_EN(chan)); int ret = 0; @@ -367,15 +366,15 @@ static int dwxgmac2_dma_interrupt(struct stmmac_priv *priv, /* TX/RX NORMAL interrupts */ if (likely(intr_status & XGMAC_NIS)) { if (likely(intr_status & XGMAC_RI)) { - u64_stats_update_begin(&rxq_stats->syncp); - rxq_stats->rx_normal_irq_n++; - u64_stats_update_end(&rxq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->rx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); ret |= handle_rx; } if (likely(intr_status & (XGMAC_TI | XGMAC_TBU))) { - u64_stats_update_begin(&txq_stats->syncp); - txq_stats->tx_normal_irq_n++; - u64_stats_update_end(&txq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->tx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); ret |= handle_tx; } } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 42d27b97dd1d..ec44becf0e2d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -549,44 +549,79 @@ stmmac_set_pauseparam(struct net_device *netdev, } }
+static u64 stmmac_get_rx_normal_irq_n(struct stmmac_priv *priv, int q) +{ + u64 total; + int cpu; + + total = 0; + for_each_possible_cpu(cpu) { + struct stmmac_pcpu_stats *pcpu; + unsigned int start; + u64 irq_n; + + pcpu = per_cpu_ptr(priv->xstats.pcpu_stats, cpu); + do { + start = u64_stats_fetch_begin(&pcpu->syncp); + irq_n = u64_stats_read(&pcpu->rx_normal_irq_n[q]); + } while (u64_stats_fetch_retry(&pcpu->syncp, start)); + total += irq_n; + } + return total; +} + +static u64 stmmac_get_tx_normal_irq_n(struct stmmac_priv *priv, int q) +{ + u64 total; + int cpu; + + total = 0; + for_each_possible_cpu(cpu) { + struct stmmac_pcpu_stats *pcpu; + unsigned int start; + u64 irq_n; + + pcpu = per_cpu_ptr(priv->xstats.pcpu_stats, cpu); + do { + start = u64_stats_fetch_begin(&pcpu->syncp); + irq_n = u64_stats_read(&pcpu->tx_normal_irq_n[q]); + } while (u64_stats_fetch_retry(&pcpu->syncp, start)); + total += irq_n; + } + return total; +} + static void stmmac_get_per_qstats(struct stmmac_priv *priv, u64 *data) { u32 tx_cnt = priv->plat->tx_queues_to_use; u32 rx_cnt = priv->plat->rx_queues_to_use; unsigned int start; - int q, stat; - char *p; + int q;
for (q = 0; q < tx_cnt; q++) { struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[q]; - struct stmmac_txq_stats snapshot; + u64 pkt_n;
do { - start = u64_stats_fetch_begin(&txq_stats->syncp); - snapshot = *txq_stats; - } while (u64_stats_fetch_retry(&txq_stats->syncp, start)); + start = u64_stats_fetch_begin(&txq_stats->napi_syncp); + pkt_n = u64_stats_read(&txq_stats->napi.tx_pkt_n); + } while (u64_stats_fetch_retry(&txq_stats->napi_syncp, start));
- p = (char *)&snapshot + offsetof(struct stmmac_txq_stats, tx_pkt_n); - for (stat = 0; stat < STMMAC_TXQ_STATS; stat++) { - *data++ = (*(u64 *)p); - p += sizeof(u64); - } + *data++ = pkt_n; + *data++ = stmmac_get_tx_normal_irq_n(priv, q); }
for (q = 0; q < rx_cnt; q++) { struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[q]; - struct stmmac_rxq_stats snapshot; + u64 pkt_n;
do { - start = u64_stats_fetch_begin(&rxq_stats->syncp); - snapshot = *rxq_stats; - } while (u64_stats_fetch_retry(&rxq_stats->syncp, start)); + start = u64_stats_fetch_begin(&rxq_stats->napi_syncp); + pkt_n = u64_stats_read(&rxq_stats->napi.rx_pkt_n); + } while (u64_stats_fetch_retry(&rxq_stats->napi_syncp, start));
- p = (char *)&snapshot + offsetof(struct stmmac_rxq_stats, rx_pkt_n); - for (stat = 0; stat < STMMAC_RXQ_STATS; stat++) { - *data++ = (*(u64 *)p); - p += sizeof(u64); - } + *data++ = pkt_n; + *data++ = stmmac_get_rx_normal_irq_n(priv, q); } }
@@ -645,39 +680,49 @@ static void stmmac_get_ethtool_stats(struct net_device *dev, pos = j; for (i = 0; i < rx_queues_count; i++) { struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[i]; - struct stmmac_rxq_stats snapshot; + struct stmmac_napi_rx_stats snapshot; + u64 n_irq;
j = pos; do { - start = u64_stats_fetch_begin(&rxq_stats->syncp); - snapshot = *rxq_stats; - } while (u64_stats_fetch_retry(&rxq_stats->syncp, start)); - - data[j++] += snapshot.rx_pkt_n; - data[j++] += snapshot.rx_normal_irq_n; - normal_irq_n += snapshot.rx_normal_irq_n; - napi_poll += snapshot.napi_poll; + start = u64_stats_fetch_begin(&rxq_stats->napi_syncp); + snapshot = rxq_stats->napi; + } while (u64_stats_fetch_retry(&rxq_stats->napi_syncp, start)); + + data[j++] += u64_stats_read(&snapshot.rx_pkt_n); + n_irq = stmmac_get_rx_normal_irq_n(priv, i); + data[j++] += n_irq; + normal_irq_n += n_irq; + napi_poll += u64_stats_read(&snapshot.poll); }
pos = j; for (i = 0; i < tx_queues_count; i++) { struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[i]; - struct stmmac_txq_stats snapshot; + struct stmmac_napi_tx_stats napi_snapshot; + struct stmmac_q_tx_stats q_snapshot; + u64 n_irq;
j = pos; do { - start = u64_stats_fetch_begin(&txq_stats->syncp); - snapshot = *txq_stats; - } while (u64_stats_fetch_retry(&txq_stats->syncp, start)); - - data[j++] += snapshot.tx_pkt_n; - data[j++] += snapshot.tx_normal_irq_n; - normal_irq_n += snapshot.tx_normal_irq_n; - data[j++] += snapshot.tx_clean; - data[j++] += snapshot.tx_set_ic_bit; - data[j++] += snapshot.tx_tso_frames; - data[j++] += snapshot.tx_tso_nfrags; - napi_poll += snapshot.napi_poll; + start = u64_stats_fetch_begin(&txq_stats->q_syncp); + q_snapshot = txq_stats->q; + } while (u64_stats_fetch_retry(&txq_stats->q_syncp, start)); + do { + start = u64_stats_fetch_begin(&txq_stats->napi_syncp); + napi_snapshot = txq_stats->napi; + } while (u64_stats_fetch_retry(&txq_stats->napi_syncp, start)); + + data[j++] += u64_stats_read(&napi_snapshot.tx_pkt_n); + n_irq = stmmac_get_tx_normal_irq_n(priv, i); + data[j++] += n_irq; + normal_irq_n += n_irq; + data[j++] += u64_stats_read(&napi_snapshot.tx_clean); + data[j++] += u64_stats_read(&q_snapshot.tx_set_ic_bit) + + u64_stats_read(&napi_snapshot.tx_set_ic_bit); + data[j++] += u64_stats_read(&q_snapshot.tx_tso_frames); + data[j++] += u64_stats_read(&q_snapshot.tx_tso_nfrags); + napi_poll += u64_stats_read(&napi_snapshot.poll); } normal_irq_n += priv->xstats.rx_early_irq; data[j++] = normal_irq_n; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 25519952f754..75d029704503 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2482,7 +2482,6 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget) struct xdp_desc xdp_desc; bool work_done = true; u32 tx_set_ic_bit = 0; - unsigned long flags;
/* Avoids TX time-out as we are sharing with slow path */ txq_trans_cond_update(nq); @@ -2566,9 +2565,9 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget) tx_q->cur_tx = STMMAC_GET_ENTRY(tx_q->cur_tx, priv->dma_conf.dma_tx_size); entry = tx_q->cur_tx; } - flags = u64_stats_update_begin_irqsave(&txq_stats->syncp); - txq_stats->tx_set_ic_bit += tx_set_ic_bit; - u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); + u64_stats_update_begin(&txq_stats->napi_syncp); + u64_stats_add(&txq_stats->napi.tx_set_ic_bit, tx_set_ic_bit); + u64_stats_update_end(&txq_stats->napi_syncp);
if (tx_desc) { stmmac_flush_tx_descriptors(priv, queue); @@ -2616,7 +2615,6 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue, unsigned int bytes_compl = 0, pkts_compl = 0; unsigned int entry, xmits = 0, count = 0; u32 tx_packets = 0, tx_errors = 0; - unsigned long flags;
__netif_tx_lock_bh(netdev_get_tx_queue(priv->dev, queue));
@@ -2782,11 +2780,11 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue, if (tx_q->dirty_tx != tx_q->cur_tx) *pending_packets = true;
- flags = u64_stats_update_begin_irqsave(&txq_stats->syncp); - txq_stats->tx_packets += tx_packets; - txq_stats->tx_pkt_n += tx_packets; - txq_stats->tx_clean++; - u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); + u64_stats_update_begin(&txq_stats->napi_syncp); + u64_stats_add(&txq_stats->napi.tx_packets, tx_packets); + u64_stats_add(&txq_stats->napi.tx_pkt_n, tx_packets); + u64_stats_inc(&txq_stats->napi.tx_clean); + u64_stats_update_end(&txq_stats->napi_syncp);
priv->xstats.tx_errors += tx_errors;
@@ -4213,7 +4211,6 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) struct stmmac_tx_queue *tx_q; bool has_vlan, set_ic; u8 proto_hdr_len, hdr; - unsigned long flags; u32 pay_len, mss; dma_addr_t des; int i; @@ -4378,13 +4375,13 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) netif_tx_stop_queue(netdev_get_tx_queue(priv->dev, queue)); }
- flags = u64_stats_update_begin_irqsave(&txq_stats->syncp); - txq_stats->tx_bytes += skb->len; - txq_stats->tx_tso_frames++; - txq_stats->tx_tso_nfrags += nfrags; + u64_stats_update_begin(&txq_stats->q_syncp); + u64_stats_add(&txq_stats->q.tx_bytes, skb->len); + u64_stats_inc(&txq_stats->q.tx_tso_frames); + u64_stats_add(&txq_stats->q.tx_tso_nfrags, nfrags); if (set_ic) - txq_stats->tx_set_ic_bit++; - u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); + u64_stats_inc(&txq_stats->q.tx_set_ic_bit); + u64_stats_update_end(&txq_stats->q_syncp);
if (priv->sarc_type) stmmac_set_desc_sarc(priv, first, priv->sarc_type); @@ -4483,7 +4480,6 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) struct stmmac_tx_queue *tx_q; bool has_vlan, set_ic; int entry, first_tx; - unsigned long flags; dma_addr_t des;
tx_q = &priv->dma_conf.tx_queue[queue]; @@ -4653,11 +4649,11 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) netif_tx_stop_queue(netdev_get_tx_queue(priv->dev, queue)); }
- flags = u64_stats_update_begin_irqsave(&txq_stats->syncp); - txq_stats->tx_bytes += skb->len; + u64_stats_update_begin(&txq_stats->q_syncp); + u64_stats_add(&txq_stats->q.tx_bytes, skb->len); if (set_ic) - txq_stats->tx_set_ic_bit++; - u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); + u64_stats_inc(&txq_stats->q.tx_set_ic_bit); + u64_stats_update_end(&txq_stats->q_syncp);
if (priv->sarc_type) stmmac_set_desc_sarc(priv, first, priv->sarc_type); @@ -4921,12 +4917,11 @@ static int stmmac_xdp_xmit_xdpf(struct stmmac_priv *priv, int queue, set_ic = false;
if (set_ic) { - unsigned long flags; tx_q->tx_count_frames = 0; stmmac_set_tx_ic(priv, tx_desc); - flags = u64_stats_update_begin_irqsave(&txq_stats->syncp); - txq_stats->tx_set_ic_bit++; - u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); + u64_stats_update_begin(&txq_stats->q_syncp); + u64_stats_inc(&txq_stats->q.tx_set_ic_bit); + u64_stats_update_end(&txq_stats->q_syncp); }
stmmac_enable_dma_transmission(priv, priv->ioaddr); @@ -5076,7 +5071,6 @@ static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue, unsigned int len = xdp->data_end - xdp->data; enum pkt_hash_types hash_type; int coe = priv->hw->rx_csum; - unsigned long flags; struct sk_buff *skb; u32 hash;
@@ -5106,10 +5100,10 @@ static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue, skb_record_rx_queue(skb, queue); napi_gro_receive(&ch->rxtx_napi, skb);
- flags = u64_stats_update_begin_irqsave(&rxq_stats->syncp); - rxq_stats->rx_pkt_n++; - rxq_stats->rx_bytes += len; - u64_stats_update_end_irqrestore(&rxq_stats->syncp, flags); + u64_stats_update_begin(&rxq_stats->napi_syncp); + u64_stats_inc(&rxq_stats->napi.rx_pkt_n); + u64_stats_add(&rxq_stats->napi.rx_bytes, len); + u64_stats_update_end(&rxq_stats->napi_syncp); }
static bool stmmac_rx_refill_zc(struct stmmac_priv *priv, u32 queue, u32 budget) @@ -5191,7 +5185,6 @@ static int stmmac_rx_zc(struct stmmac_priv *priv, int limit, u32 queue) unsigned int desc_size; struct bpf_prog *prog; bool failure = false; - unsigned long flags; int xdp_status = 0; int status = 0;
@@ -5346,9 +5339,9 @@ static int stmmac_rx_zc(struct stmmac_priv *priv, int limit, u32 queue)
stmmac_finalize_xdp_rx(priv, xdp_status);
- flags = u64_stats_update_begin_irqsave(&rxq_stats->syncp); - rxq_stats->rx_pkt_n += count; - u64_stats_update_end_irqrestore(&rxq_stats->syncp, flags); + u64_stats_update_begin(&rxq_stats->napi_syncp); + u64_stats_add(&rxq_stats->napi.rx_pkt_n, count); + u64_stats_update_end(&rxq_stats->napi_syncp);
priv->xstats.rx_dropped += rx_dropped; priv->xstats.rx_errors += rx_errors; @@ -5386,7 +5379,6 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) unsigned int desc_size; struct sk_buff *skb = NULL; struct stmmac_xdp_buff ctx; - unsigned long flags; int xdp_status = 0; int buf_sz;
@@ -5646,11 +5638,11 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
stmmac_rx_refill(priv, queue);
- flags = u64_stats_update_begin_irqsave(&rxq_stats->syncp); - rxq_stats->rx_packets += rx_packets; - rxq_stats->rx_bytes += rx_bytes; - rxq_stats->rx_pkt_n += count; - u64_stats_update_end_irqrestore(&rxq_stats->syncp, flags); + u64_stats_update_begin(&rxq_stats->napi_syncp); + u64_stats_add(&rxq_stats->napi.rx_packets, rx_packets); + u64_stats_add(&rxq_stats->napi.rx_bytes, rx_bytes); + u64_stats_add(&rxq_stats->napi.rx_pkt_n, count); + u64_stats_update_end(&rxq_stats->napi_syncp);
priv->xstats.rx_dropped += rx_dropped; priv->xstats.rx_errors += rx_errors; @@ -5665,13 +5657,12 @@ static int stmmac_napi_poll_rx(struct napi_struct *napi, int budget) struct stmmac_priv *priv = ch->priv_data; struct stmmac_rxq_stats *rxq_stats; u32 chan = ch->index; - unsigned long flags; int work_done;
rxq_stats = &priv->xstats.rxq_stats[chan]; - flags = u64_stats_update_begin_irqsave(&rxq_stats->syncp); - rxq_stats->napi_poll++; - u64_stats_update_end_irqrestore(&rxq_stats->syncp, flags); + u64_stats_update_begin(&rxq_stats->napi_syncp); + u64_stats_inc(&rxq_stats->napi.poll); + u64_stats_update_end(&rxq_stats->napi_syncp);
work_done = stmmac_rx(priv, budget, chan); if (work_done < budget && napi_complete_done(napi, work_done)) { @@ -5693,13 +5684,12 @@ static int stmmac_napi_poll_tx(struct napi_struct *napi, int budget) struct stmmac_txq_stats *txq_stats; bool pending_packets = false; u32 chan = ch->index; - unsigned long flags; int work_done;
txq_stats = &priv->xstats.txq_stats[chan]; - flags = u64_stats_update_begin_irqsave(&txq_stats->syncp); - txq_stats->napi_poll++; - u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); + u64_stats_update_begin(&txq_stats->napi_syncp); + u64_stats_inc(&txq_stats->napi.poll); + u64_stats_update_end(&txq_stats->napi_syncp);
work_done = stmmac_tx_clean(priv, budget, chan, &pending_packets); work_done = min(work_done, budget); @@ -5729,17 +5719,16 @@ static int stmmac_napi_poll_rxtx(struct napi_struct *napi, int budget) struct stmmac_rxq_stats *rxq_stats; struct stmmac_txq_stats *txq_stats; u32 chan = ch->index; - unsigned long flags;
rxq_stats = &priv->xstats.rxq_stats[chan]; - flags = u64_stats_update_begin_irqsave(&rxq_stats->syncp); - rxq_stats->napi_poll++; - u64_stats_update_end_irqrestore(&rxq_stats->syncp, flags); + u64_stats_update_begin(&rxq_stats->napi_syncp); + u64_stats_inc(&rxq_stats->napi.poll); + u64_stats_update_end(&rxq_stats->napi_syncp);
txq_stats = &priv->xstats.txq_stats[chan]; - flags = u64_stats_update_begin_irqsave(&txq_stats->syncp); - txq_stats->napi_poll++; - u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); + u64_stats_update_begin(&txq_stats->napi_syncp); + u64_stats_inc(&txq_stats->napi.poll); + u64_stats_update_end(&txq_stats->napi_syncp);
tx_done = stmmac_tx_clean(priv, budget, chan, &tx_pending_packets); tx_done = min(tx_done, budget); @@ -7065,10 +7054,13 @@ static void stmmac_get_stats64(struct net_device *dev, struct rtnl_link_stats64 u64 tx_bytes;
do { - start = u64_stats_fetch_begin(&txq_stats->syncp); - tx_packets = txq_stats->tx_packets; - tx_bytes = txq_stats->tx_bytes; - } while (u64_stats_fetch_retry(&txq_stats->syncp, start)); + start = u64_stats_fetch_begin(&txq_stats->q_syncp); + tx_bytes = u64_stats_read(&txq_stats->q.tx_bytes); + } while (u64_stats_fetch_retry(&txq_stats->q_syncp, start)); + do { + start = u64_stats_fetch_begin(&txq_stats->napi_syncp); + tx_packets = u64_stats_read(&txq_stats->napi.tx_packets); + } while (u64_stats_fetch_retry(&txq_stats->napi_syncp, start));
stats->tx_packets += tx_packets; stats->tx_bytes += tx_bytes; @@ -7080,10 +7072,10 @@ static void stmmac_get_stats64(struct net_device *dev, struct rtnl_link_stats64 u64 rx_bytes;
do { - start = u64_stats_fetch_begin(&rxq_stats->syncp); - rx_packets = rxq_stats->rx_packets; - rx_bytes = rxq_stats->rx_bytes; - } while (u64_stats_fetch_retry(&rxq_stats->syncp, start)); + start = u64_stats_fetch_begin(&rxq_stats->napi_syncp); + rx_packets = u64_stats_read(&rxq_stats->napi.rx_packets); + rx_bytes = u64_stats_read(&rxq_stats->napi.rx_bytes); + } while (u64_stats_fetch_retry(&rxq_stats->napi_syncp, start));
stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; @@ -7477,9 +7469,16 @@ int stmmac_dvr_probe(struct device *device, priv->dev = ndev;
for (i = 0; i < MTL_MAX_RX_QUEUES; i++) - u64_stats_init(&priv->xstats.rxq_stats[i].syncp); - for (i = 0; i < MTL_MAX_TX_QUEUES; i++) - u64_stats_init(&priv->xstats.txq_stats[i].syncp); + u64_stats_init(&priv->xstats.rxq_stats[i].napi_syncp); + for (i = 0; i < MTL_MAX_TX_QUEUES; i++) { + u64_stats_init(&priv->xstats.txq_stats[i].q_syncp); + u64_stats_init(&priv->xstats.txq_stats[i].napi_syncp); + } + + priv->xstats.pcpu_stats = + devm_netdev_alloc_pcpu_stats(device, struct stmmac_pcpu_stats); + if (!priv->xstats.pcpu_stats) + return -ENOMEM;
stmmac_set_ethtool_ops(ndev); priv->pause = pause;
On Sat, Feb 03, 2024 at 08:09:27PM +0100, Petr Tesarik wrote:
As explained by a comment in <linux/u64_stats_sync.h>, write side of struct u64_stats_sync must ensure mutual exclusion, or one seqcount update could be lost on 32-bit platforms, thus blocking readers forever. Such lockups have been observed in real world after stmmac_xmit() on one CPU raced with stmmac_napi_poll_tx() on another CPU.
To fix the issue without introducing a new lock, split the statistics into three parts:
- fields updated only under the tx queue lock,
- fields updated only during NAPI poll,
- fields updated only from interrupt context,
Updates to fields in the first two groups are already serialized through other locks. It is sufficient to split the existing struct u64_stats_sync so that each group has its own.
Note that tx_set_ic_bit is updated from both contexts. Split this counter so that each context gets its own, and calculate their sum to get the total value in stmmac_get_ethtool_stats().
For the third group, multiple interrupts may be processed by different CPUs at the same time, but interrupts on the same CPU will not nest. Move fields from this group to a newly created per-cpu struct stmmac_pcpu_stats.
Fixes: 133466c3bbe1 ("net: stmmac: use per-queue 64 bit statistics where necessary") Link: https://lore.kernel.org/netdev/Za173PhviYg-1qIn@torres.zugschlus.de/t/ Cc: stable@vger.kernel.org Signed-off-by: Petr Tesarik <petr@tesarici.cz>
Reviewed-by: Jisheng Zhang <jszhang@kernel.org>
drivers/net/ethernet/stmicro/stmmac/common.h | 56 +++++--- .../net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 15 +- .../net/ethernet/stmicro/stmmac/dwmac4_lib.c | 15 +- .../net/ethernet/stmicro/stmmac/dwmac_lib.c | 15 +- .../ethernet/stmicro/stmmac/dwxgmac2_dma.c | 15 +- .../ethernet/stmicro/stmmac/stmmac_ethtool.c | 129 +++++++++++------ .../net/ethernet/stmicro/stmmac/stmmac_main.c | 133 +++++++++--------- 7 files changed, 221 insertions(+), 157 deletions(-)
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 721c1f8e892f..4aca20feb4b7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -59,28 +59,51 @@ #undef FRAME_FILTER_DEBUG /* #define FRAME_FILTER_DEBUG */ +struct stmmac_q_tx_stats {
- u64_stats_t tx_bytes;
- u64_stats_t tx_set_ic_bit;
- u64_stats_t tx_tso_frames;
- u64_stats_t tx_tso_nfrags;
+};
+struct stmmac_napi_tx_stats {
- u64_stats_t tx_packets;
- u64_stats_t tx_pkt_n;
- u64_stats_t poll;
- u64_stats_t tx_clean;
- u64_stats_t tx_set_ic_bit;
+};
struct stmmac_txq_stats {
- u64 tx_bytes;
- u64 tx_packets;
- u64 tx_pkt_n;
- u64 tx_normal_irq_n;
- u64 napi_poll;
- u64 tx_clean;
- u64 tx_set_ic_bit;
- u64 tx_tso_frames;
- u64 tx_tso_nfrags;
- struct u64_stats_sync syncp;
- /* Updates protected by tx queue lock. */
- struct u64_stats_sync q_syncp;
- struct stmmac_q_tx_stats q;
- /* Updates protected by NAPI poll logic. */
- struct u64_stats_sync napi_syncp;
- struct stmmac_napi_tx_stats napi;
} ____cacheline_aligned_in_smp; +struct stmmac_napi_rx_stats {
- u64_stats_t rx_bytes;
- u64_stats_t rx_packets;
- u64_stats_t rx_pkt_n;
- u64_stats_t poll;
+};
struct stmmac_rxq_stats {
- u64 rx_bytes;
- u64 rx_packets;
- u64 rx_pkt_n;
- u64 rx_normal_irq_n;
- u64 napi_poll;
- struct u64_stats_sync syncp;
- /* Updates protected by NAPI poll logic. */
- struct u64_stats_sync napi_syncp;
- struct stmmac_napi_rx_stats napi;
} ____cacheline_aligned_in_smp; +/* Updates on each CPU protected by not allowing nested irqs. */ +struct stmmac_pcpu_stats {
- struct u64_stats_sync syncp;
- u64_stats_t rx_normal_irq_n[MTL_MAX_TX_QUEUES];
- u64_stats_t tx_normal_irq_n[MTL_MAX_RX_QUEUES];
+};
/* Extra statistic and debug information exposed by ethtool */ struct stmmac_extra_stats { /* Transmit errors */ @@ -205,6 +228,7 @@ struct stmmac_extra_stats { /* per queue statistics */ struct stmmac_txq_stats txq_stats[MTL_MAX_TX_QUEUES]; struct stmmac_rxq_stats rxq_stats[MTL_MAX_RX_QUEUES];
- struct stmmac_pcpu_stats __percpu *pcpu_stats; unsigned long rx_dropped; unsigned long rx_errors; unsigned long tx_dropped;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 137741b94122..b21d99faa2d0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -441,8 +441,7 @@ static int sun8i_dwmac_dma_interrupt(struct stmmac_priv *priv, struct stmmac_extra_stats *x, u32 chan, u32 dir) {
- struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[chan];
- struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[chan];
- struct stmmac_pcpu_stats *stats = this_cpu_ptr(priv->xstats.pcpu_stats); int ret = 0; u32 v;
@@ -455,9 +454,9 @@ static int sun8i_dwmac_dma_interrupt(struct stmmac_priv *priv, if (v & EMAC_TX_INT) { ret |= handle_tx;
u64_stats_update_begin(&txq_stats->syncp);
txq_stats->tx_normal_irq_n++;
u64_stats_update_end(&txq_stats->syncp);
u64_stats_update_begin(&stats->syncp);
u64_stats_inc(&stats->tx_normal_irq_n[chan]);
}u64_stats_update_end(&stats->syncp);
if (v & EMAC_TX_DMA_STOP_INT) @@ -479,9 +478,9 @@ static int sun8i_dwmac_dma_interrupt(struct stmmac_priv *priv, if (v & EMAC_RX_INT) { ret |= handle_rx;
u64_stats_update_begin(&rxq_stats->syncp);
rxq_stats->rx_normal_irq_n++;
u64_stats_update_end(&rxq_stats->syncp);
u64_stats_update_begin(&stats->syncp);
u64_stats_inc(&stats->rx_normal_irq_n[chan]);
}u64_stats_update_end(&stats->syncp);
if (v & EMAC_RX_BUF_UA_INT) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c index 9470d3fd2ded..0d185e54eb7e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c @@ -171,8 +171,7 @@ int dwmac4_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr, const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs; u32 intr_status = readl(ioaddr + DMA_CHAN_STATUS(dwmac4_addrs, chan)); u32 intr_en = readl(ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
- struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[chan];
- struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[chan];
- struct stmmac_pcpu_stats *stats = this_cpu_ptr(priv->xstats.pcpu_stats); int ret = 0;
if (dir == DMA_DIR_RX) @@ -201,15 +200,15 @@ int dwmac4_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr, } /* TX/RX NORMAL interrupts */ if (likely(intr_status & DMA_CHAN_STATUS_RI)) {
u64_stats_update_begin(&rxq_stats->syncp);
rxq_stats->rx_normal_irq_n++;
u64_stats_update_end(&rxq_stats->syncp);
u64_stats_update_begin(&stats->syncp);
u64_stats_inc(&stats->rx_normal_irq_n[chan]);
ret |= handle_rx; } if (likely(intr_status & DMA_CHAN_STATUS_TI)) {u64_stats_update_end(&stats->syncp);
u64_stats_update_begin(&txq_stats->syncp);
txq_stats->tx_normal_irq_n++;
u64_stats_update_end(&txq_stats->syncp);
u64_stats_update_begin(&stats->syncp);
u64_stats_inc(&stats->tx_normal_irq_n[chan]);
ret |= handle_tx; }u64_stats_update_end(&stats->syncp);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c b/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c index 7907d62d3437..85e18f9a22f9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c @@ -162,8 +162,7 @@ static void show_rx_process_state(unsigned int status) int dwmac_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr, struct stmmac_extra_stats *x, u32 chan, u32 dir) {
- struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[chan];
- struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[chan];
- struct stmmac_pcpu_stats *stats = this_cpu_ptr(priv->xstats.pcpu_stats); int ret = 0; /* read the status register (CSR5) */ u32 intr_status = readl(ioaddr + DMA_STATUS);
@@ -215,16 +214,16 @@ int dwmac_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr, u32 value = readl(ioaddr + DMA_INTR_ENA); /* to schedule NAPI on real RIE event. */ if (likely(value & DMA_INTR_ENA_RIE)) {
u64_stats_update_begin(&rxq_stats->syncp);
rxq_stats->rx_normal_irq_n++;
u64_stats_update_end(&rxq_stats->syncp);
u64_stats_update_begin(&stats->syncp);
u64_stats_inc(&stats->rx_normal_irq_n[chan]);
} if (likely(intr_status & DMA_STATUS_TI)) {u64_stats_update_end(&stats->syncp); ret |= handle_rx; }
u64_stats_update_begin(&txq_stats->syncp);
txq_stats->tx_normal_irq_n++;
u64_stats_update_end(&txq_stats->syncp);
u64_stats_update_begin(&stats->syncp);
u64_stats_inc(&stats->tx_normal_irq_n[chan]);
} if (unlikely(intr_status & DMA_STATUS_ERI))u64_stats_update_end(&stats->syncp); ret |= handle_tx;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c index 3cde695fec91..dd2ab6185c40 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c @@ -337,8 +337,7 @@ static int dwxgmac2_dma_interrupt(struct stmmac_priv *priv, struct stmmac_extra_stats *x, u32 chan, u32 dir) {
- struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[chan];
- struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[chan];
- struct stmmac_pcpu_stats *stats = this_cpu_ptr(priv->xstats.pcpu_stats); u32 intr_status = readl(ioaddr + XGMAC_DMA_CH_STATUS(chan)); u32 intr_en = readl(ioaddr + XGMAC_DMA_CH_INT_EN(chan)); int ret = 0;
@@ -367,15 +366,15 @@ static int dwxgmac2_dma_interrupt(struct stmmac_priv *priv, /* TX/RX NORMAL interrupts */ if (likely(intr_status & XGMAC_NIS)) { if (likely(intr_status & XGMAC_RI)) {
u64_stats_update_begin(&rxq_stats->syncp);
rxq_stats->rx_normal_irq_n++;
u64_stats_update_end(&rxq_stats->syncp);
u64_stats_update_begin(&stats->syncp);
u64_stats_inc(&stats->rx_normal_irq_n[chan]);
} if (likely(intr_status & (XGMAC_TI | XGMAC_TBU))) {u64_stats_update_end(&stats->syncp); ret |= handle_rx;
u64_stats_update_begin(&txq_stats->syncp);
txq_stats->tx_normal_irq_n++;
u64_stats_update_end(&txq_stats->syncp);
u64_stats_update_begin(&stats->syncp);
u64_stats_inc(&stats->tx_normal_irq_n[chan]);
} }u64_stats_update_end(&stats->syncp); ret |= handle_tx;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 42d27b97dd1d..ec44becf0e2d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -549,44 +549,79 @@ stmmac_set_pauseparam(struct net_device *netdev, } } +static u64 stmmac_get_rx_normal_irq_n(struct stmmac_priv *priv, int q) +{
- u64 total;
- int cpu;
- total = 0;
- for_each_possible_cpu(cpu) {
struct stmmac_pcpu_stats *pcpu;
unsigned int start;
u64 irq_n;
pcpu = per_cpu_ptr(priv->xstats.pcpu_stats, cpu);
do {
start = u64_stats_fetch_begin(&pcpu->syncp);
irq_n = u64_stats_read(&pcpu->rx_normal_irq_n[q]);
} while (u64_stats_fetch_retry(&pcpu->syncp, start));
total += irq_n;
- }
- return total;
+}
+static u64 stmmac_get_tx_normal_irq_n(struct stmmac_priv *priv, int q) +{
- u64 total;
- int cpu;
- total = 0;
- for_each_possible_cpu(cpu) {
struct stmmac_pcpu_stats *pcpu;
unsigned int start;
u64 irq_n;
pcpu = per_cpu_ptr(priv->xstats.pcpu_stats, cpu);
do {
start = u64_stats_fetch_begin(&pcpu->syncp);
irq_n = u64_stats_read(&pcpu->tx_normal_irq_n[q]);
} while (u64_stats_fetch_retry(&pcpu->syncp, start));
total += irq_n;
- }
- return total;
+}
static void stmmac_get_per_qstats(struct stmmac_priv *priv, u64 *data) { u32 tx_cnt = priv->plat->tx_queues_to_use; u32 rx_cnt = priv->plat->rx_queues_to_use; unsigned int start;
- int q, stat;
- char *p;
- int q;
for (q = 0; q < tx_cnt; q++) { struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[q];
struct stmmac_txq_stats snapshot;
u64 pkt_n;
do {
start = u64_stats_fetch_begin(&txq_stats->syncp);
snapshot = *txq_stats;
} while (u64_stats_fetch_retry(&txq_stats->syncp, start));
start = u64_stats_fetch_begin(&txq_stats->napi_syncp);
pkt_n = u64_stats_read(&txq_stats->napi.tx_pkt_n);
} while (u64_stats_fetch_retry(&txq_stats->napi_syncp, start));
p = (char *)&snapshot + offsetof(struct stmmac_txq_stats, tx_pkt_n);
for (stat = 0; stat < STMMAC_TXQ_STATS; stat++) {
*data++ = (*(u64 *)p);
p += sizeof(u64);
}
*data++ = pkt_n;
}*data++ = stmmac_get_tx_normal_irq_n(priv, q);
for (q = 0; q < rx_cnt; q++) { struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[q];
struct stmmac_rxq_stats snapshot;
u64 pkt_n;
do {
start = u64_stats_fetch_begin(&rxq_stats->syncp);
snapshot = *rxq_stats;
} while (u64_stats_fetch_retry(&rxq_stats->syncp, start));
start = u64_stats_fetch_begin(&rxq_stats->napi_syncp);
pkt_n = u64_stats_read(&rxq_stats->napi.rx_pkt_n);
} while (u64_stats_fetch_retry(&rxq_stats->napi_syncp, start));
p = (char *)&snapshot + offsetof(struct stmmac_rxq_stats, rx_pkt_n);
for (stat = 0; stat < STMMAC_RXQ_STATS; stat++) {
*data++ = (*(u64 *)p);
p += sizeof(u64);
}
*data++ = pkt_n;
}*data++ = stmmac_get_rx_normal_irq_n(priv, q);
} @@ -645,39 +680,49 @@ static void stmmac_get_ethtool_stats(struct net_device *dev, pos = j; for (i = 0; i < rx_queues_count; i++) { struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[i];
struct stmmac_rxq_stats snapshot;
struct stmmac_napi_rx_stats snapshot;
u64 n_irq;
j = pos; do {
start = u64_stats_fetch_begin(&rxq_stats->syncp);
snapshot = *rxq_stats;
} while (u64_stats_fetch_retry(&rxq_stats->syncp, start));
data[j++] += snapshot.rx_pkt_n;
data[j++] += snapshot.rx_normal_irq_n;
normal_irq_n += snapshot.rx_normal_irq_n;
napi_poll += snapshot.napi_poll;
start = u64_stats_fetch_begin(&rxq_stats->napi_syncp);
snapshot = rxq_stats->napi;
} while (u64_stats_fetch_retry(&rxq_stats->napi_syncp, start));
data[j++] += u64_stats_read(&snapshot.rx_pkt_n);
n_irq = stmmac_get_rx_normal_irq_n(priv, i);
data[j++] += n_irq;
normal_irq_n += n_irq;
}napi_poll += u64_stats_read(&snapshot.poll);
pos = j; for (i = 0; i < tx_queues_count; i++) { struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[i];
struct stmmac_txq_stats snapshot;
struct stmmac_napi_tx_stats napi_snapshot;
struct stmmac_q_tx_stats q_snapshot;
u64 n_irq;
j = pos; do {
start = u64_stats_fetch_begin(&txq_stats->syncp);
snapshot = *txq_stats;
} while (u64_stats_fetch_retry(&txq_stats->syncp, start));
data[j++] += snapshot.tx_pkt_n;
data[j++] += snapshot.tx_normal_irq_n;
normal_irq_n += snapshot.tx_normal_irq_n;
data[j++] += snapshot.tx_clean;
data[j++] += snapshot.tx_set_ic_bit;
data[j++] += snapshot.tx_tso_frames;
data[j++] += snapshot.tx_tso_nfrags;
napi_poll += snapshot.napi_poll;
start = u64_stats_fetch_begin(&txq_stats->q_syncp);
q_snapshot = txq_stats->q;
} while (u64_stats_fetch_retry(&txq_stats->q_syncp, start));
do {
start = u64_stats_fetch_begin(&txq_stats->napi_syncp);
napi_snapshot = txq_stats->napi;
} while (u64_stats_fetch_retry(&txq_stats->napi_syncp, start));
data[j++] += u64_stats_read(&napi_snapshot.tx_pkt_n);
n_irq = stmmac_get_tx_normal_irq_n(priv, i);
data[j++] += n_irq;
normal_irq_n += n_irq;
data[j++] += u64_stats_read(&napi_snapshot.tx_clean);
data[j++] += u64_stats_read(&q_snapshot.tx_set_ic_bit) +
u64_stats_read(&napi_snapshot.tx_set_ic_bit);
data[j++] += u64_stats_read(&q_snapshot.tx_tso_frames);
data[j++] += u64_stats_read(&q_snapshot.tx_tso_nfrags);
} normal_irq_n += priv->xstats.rx_early_irq; data[j++] = normal_irq_n;napi_poll += u64_stats_read(&napi_snapshot.poll);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 25519952f754..75d029704503 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2482,7 +2482,6 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget) struct xdp_desc xdp_desc; bool work_done = true; u32 tx_set_ic_bit = 0;
- unsigned long flags;
/* Avoids TX time-out as we are sharing with slow path */ txq_trans_cond_update(nq); @@ -2566,9 +2565,9 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget) tx_q->cur_tx = STMMAC_GET_ENTRY(tx_q->cur_tx, priv->dma_conf.dma_tx_size); entry = tx_q->cur_tx; }
- flags = u64_stats_update_begin_irqsave(&txq_stats->syncp);
- txq_stats->tx_set_ic_bit += tx_set_ic_bit;
- u64_stats_update_end_irqrestore(&txq_stats->syncp, flags);
- u64_stats_update_begin(&txq_stats->napi_syncp);
- u64_stats_add(&txq_stats->napi.tx_set_ic_bit, tx_set_ic_bit);
- u64_stats_update_end(&txq_stats->napi_syncp);
if (tx_desc) { stmmac_flush_tx_descriptors(priv, queue); @@ -2616,7 +2615,6 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue, unsigned int bytes_compl = 0, pkts_compl = 0; unsigned int entry, xmits = 0, count = 0; u32 tx_packets = 0, tx_errors = 0;
- unsigned long flags;
__netif_tx_lock_bh(netdev_get_tx_queue(priv->dev, queue)); @@ -2782,11 +2780,11 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue, if (tx_q->dirty_tx != tx_q->cur_tx) *pending_packets = true;
- flags = u64_stats_update_begin_irqsave(&txq_stats->syncp);
- txq_stats->tx_packets += tx_packets;
- txq_stats->tx_pkt_n += tx_packets;
- txq_stats->tx_clean++;
- u64_stats_update_end_irqrestore(&txq_stats->syncp, flags);
- u64_stats_update_begin(&txq_stats->napi_syncp);
- u64_stats_add(&txq_stats->napi.tx_packets, tx_packets);
- u64_stats_add(&txq_stats->napi.tx_pkt_n, tx_packets);
- u64_stats_inc(&txq_stats->napi.tx_clean);
- u64_stats_update_end(&txq_stats->napi_syncp);
priv->xstats.tx_errors += tx_errors; @@ -4213,7 +4211,6 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) struct stmmac_tx_queue *tx_q; bool has_vlan, set_ic; u8 proto_hdr_len, hdr;
- unsigned long flags; u32 pay_len, mss; dma_addr_t des; int i;
@@ -4378,13 +4375,13 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) netif_tx_stop_queue(netdev_get_tx_queue(priv->dev, queue)); }
- flags = u64_stats_update_begin_irqsave(&txq_stats->syncp);
- txq_stats->tx_bytes += skb->len;
- txq_stats->tx_tso_frames++;
- txq_stats->tx_tso_nfrags += nfrags;
- u64_stats_update_begin(&txq_stats->q_syncp);
- u64_stats_add(&txq_stats->q.tx_bytes, skb->len);
- u64_stats_inc(&txq_stats->q.tx_tso_frames);
- u64_stats_add(&txq_stats->q.tx_tso_nfrags, nfrags); if (set_ic)
txq_stats->tx_set_ic_bit++;
- u64_stats_update_end_irqrestore(&txq_stats->syncp, flags);
u64_stats_inc(&txq_stats->q.tx_set_ic_bit);
- u64_stats_update_end(&txq_stats->q_syncp);
if (priv->sarc_type) stmmac_set_desc_sarc(priv, first, priv->sarc_type); @@ -4483,7 +4480,6 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) struct stmmac_tx_queue *tx_q; bool has_vlan, set_ic; int entry, first_tx;
- unsigned long flags; dma_addr_t des;
tx_q = &priv->dma_conf.tx_queue[queue]; @@ -4653,11 +4649,11 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) netif_tx_stop_queue(netdev_get_tx_queue(priv->dev, queue)); }
- flags = u64_stats_update_begin_irqsave(&txq_stats->syncp);
- txq_stats->tx_bytes += skb->len;
- u64_stats_update_begin(&txq_stats->q_syncp);
- u64_stats_add(&txq_stats->q.tx_bytes, skb->len); if (set_ic)
txq_stats->tx_set_ic_bit++;
- u64_stats_update_end_irqrestore(&txq_stats->syncp, flags);
u64_stats_inc(&txq_stats->q.tx_set_ic_bit);
- u64_stats_update_end(&txq_stats->q_syncp);
if (priv->sarc_type) stmmac_set_desc_sarc(priv, first, priv->sarc_type); @@ -4921,12 +4917,11 @@ static int stmmac_xdp_xmit_xdpf(struct stmmac_priv *priv, int queue, set_ic = false; if (set_ic) {
tx_q->tx_count_frames = 0; stmmac_set_tx_ic(priv, tx_desc);unsigned long flags;
flags = u64_stats_update_begin_irqsave(&txq_stats->syncp);
txq_stats->tx_set_ic_bit++;
u64_stats_update_end_irqrestore(&txq_stats->syncp, flags);
u64_stats_update_begin(&txq_stats->q_syncp);
u64_stats_inc(&txq_stats->q.tx_set_ic_bit);
}u64_stats_update_end(&txq_stats->q_syncp);
stmmac_enable_dma_transmission(priv, priv->ioaddr); @@ -5076,7 +5071,6 @@ static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue, unsigned int len = xdp->data_end - xdp->data; enum pkt_hash_types hash_type; int coe = priv->hw->rx_csum;
- unsigned long flags; struct sk_buff *skb; u32 hash;
@@ -5106,10 +5100,10 @@ static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue, skb_record_rx_queue(skb, queue); napi_gro_receive(&ch->rxtx_napi, skb);
- flags = u64_stats_update_begin_irqsave(&rxq_stats->syncp);
- rxq_stats->rx_pkt_n++;
- rxq_stats->rx_bytes += len;
- u64_stats_update_end_irqrestore(&rxq_stats->syncp, flags);
- u64_stats_update_begin(&rxq_stats->napi_syncp);
- u64_stats_inc(&rxq_stats->napi.rx_pkt_n);
- u64_stats_add(&rxq_stats->napi.rx_bytes, len);
- u64_stats_update_end(&rxq_stats->napi_syncp);
} static bool stmmac_rx_refill_zc(struct stmmac_priv *priv, u32 queue, u32 budget) @@ -5191,7 +5185,6 @@ static int stmmac_rx_zc(struct stmmac_priv *priv, int limit, u32 queue) unsigned int desc_size; struct bpf_prog *prog; bool failure = false;
- unsigned long flags; int xdp_status = 0; int status = 0;
@@ -5346,9 +5339,9 @@ static int stmmac_rx_zc(struct stmmac_priv *priv, int limit, u32 queue) stmmac_finalize_xdp_rx(priv, xdp_status);
- flags = u64_stats_update_begin_irqsave(&rxq_stats->syncp);
- rxq_stats->rx_pkt_n += count;
- u64_stats_update_end_irqrestore(&rxq_stats->syncp, flags);
- u64_stats_update_begin(&rxq_stats->napi_syncp);
- u64_stats_add(&rxq_stats->napi.rx_pkt_n, count);
- u64_stats_update_end(&rxq_stats->napi_syncp);
priv->xstats.rx_dropped += rx_dropped; priv->xstats.rx_errors += rx_errors; @@ -5386,7 +5379,6 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) unsigned int desc_size; struct sk_buff *skb = NULL; struct stmmac_xdp_buff ctx;
- unsigned long flags; int xdp_status = 0; int buf_sz;
@@ -5646,11 +5638,11 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) stmmac_rx_refill(priv, queue);
- flags = u64_stats_update_begin_irqsave(&rxq_stats->syncp);
- rxq_stats->rx_packets += rx_packets;
- rxq_stats->rx_bytes += rx_bytes;
- rxq_stats->rx_pkt_n += count;
- u64_stats_update_end_irqrestore(&rxq_stats->syncp, flags);
- u64_stats_update_begin(&rxq_stats->napi_syncp);
- u64_stats_add(&rxq_stats->napi.rx_packets, rx_packets);
- u64_stats_add(&rxq_stats->napi.rx_bytes, rx_bytes);
- u64_stats_add(&rxq_stats->napi.rx_pkt_n, count);
- u64_stats_update_end(&rxq_stats->napi_syncp);
priv->xstats.rx_dropped += rx_dropped; priv->xstats.rx_errors += rx_errors; @@ -5665,13 +5657,12 @@ static int stmmac_napi_poll_rx(struct napi_struct *napi, int budget) struct stmmac_priv *priv = ch->priv_data; struct stmmac_rxq_stats *rxq_stats; u32 chan = ch->index;
- unsigned long flags; int work_done;
rxq_stats = &priv->xstats.rxq_stats[chan];
- flags = u64_stats_update_begin_irqsave(&rxq_stats->syncp);
- rxq_stats->napi_poll++;
- u64_stats_update_end_irqrestore(&rxq_stats->syncp, flags);
- u64_stats_update_begin(&rxq_stats->napi_syncp);
- u64_stats_inc(&rxq_stats->napi.poll);
- u64_stats_update_end(&rxq_stats->napi_syncp);
work_done = stmmac_rx(priv, budget, chan); if (work_done < budget && napi_complete_done(napi, work_done)) { @@ -5693,13 +5684,12 @@ static int stmmac_napi_poll_tx(struct napi_struct *napi, int budget) struct stmmac_txq_stats *txq_stats; bool pending_packets = false; u32 chan = ch->index;
- unsigned long flags; int work_done;
txq_stats = &priv->xstats.txq_stats[chan];
- flags = u64_stats_update_begin_irqsave(&txq_stats->syncp);
- txq_stats->napi_poll++;
- u64_stats_update_end_irqrestore(&txq_stats->syncp, flags);
- u64_stats_update_begin(&txq_stats->napi_syncp);
- u64_stats_inc(&txq_stats->napi.poll);
- u64_stats_update_end(&txq_stats->napi_syncp);
work_done = stmmac_tx_clean(priv, budget, chan, &pending_packets); work_done = min(work_done, budget); @@ -5729,17 +5719,16 @@ static int stmmac_napi_poll_rxtx(struct napi_struct *napi, int budget) struct stmmac_rxq_stats *rxq_stats; struct stmmac_txq_stats *txq_stats; u32 chan = ch->index;
- unsigned long flags;
rxq_stats = &priv->xstats.rxq_stats[chan];
- flags = u64_stats_update_begin_irqsave(&rxq_stats->syncp);
- rxq_stats->napi_poll++;
- u64_stats_update_end_irqrestore(&rxq_stats->syncp, flags);
- u64_stats_update_begin(&rxq_stats->napi_syncp);
- u64_stats_inc(&rxq_stats->napi.poll);
- u64_stats_update_end(&rxq_stats->napi_syncp);
txq_stats = &priv->xstats.txq_stats[chan];
- flags = u64_stats_update_begin_irqsave(&txq_stats->syncp);
- txq_stats->napi_poll++;
- u64_stats_update_end_irqrestore(&txq_stats->syncp, flags);
- u64_stats_update_begin(&txq_stats->napi_syncp);
- u64_stats_inc(&txq_stats->napi.poll);
- u64_stats_update_end(&txq_stats->napi_syncp);
tx_done = stmmac_tx_clean(priv, budget, chan, &tx_pending_packets); tx_done = min(tx_done, budget); @@ -7065,10 +7054,13 @@ static void stmmac_get_stats64(struct net_device *dev, struct rtnl_link_stats64 u64 tx_bytes; do {
start = u64_stats_fetch_begin(&txq_stats->syncp);
tx_packets = txq_stats->tx_packets;
tx_bytes = txq_stats->tx_bytes;
} while (u64_stats_fetch_retry(&txq_stats->syncp, start));
start = u64_stats_fetch_begin(&txq_stats->q_syncp);
tx_bytes = u64_stats_read(&txq_stats->q.tx_bytes);
} while (u64_stats_fetch_retry(&txq_stats->q_syncp, start));
do {
start = u64_stats_fetch_begin(&txq_stats->napi_syncp);
tx_packets = u64_stats_read(&txq_stats->napi.tx_packets);
} while (u64_stats_fetch_retry(&txq_stats->napi_syncp, start));
stats->tx_packets += tx_packets; stats->tx_bytes += tx_bytes; @@ -7080,10 +7072,10 @@ static void stmmac_get_stats64(struct net_device *dev, struct rtnl_link_stats64 u64 rx_bytes; do {
start = u64_stats_fetch_begin(&rxq_stats->syncp);
rx_packets = rxq_stats->rx_packets;
rx_bytes = rxq_stats->rx_bytes;
} while (u64_stats_fetch_retry(&rxq_stats->syncp, start));
start = u64_stats_fetch_begin(&rxq_stats->napi_syncp);
rx_packets = u64_stats_read(&rxq_stats->napi.rx_packets);
rx_bytes = u64_stats_read(&rxq_stats->napi.rx_bytes);
} while (u64_stats_fetch_retry(&rxq_stats->napi_syncp, start));
stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; @@ -7477,9 +7469,16 @@ int stmmac_dvr_probe(struct device *device, priv->dev = ndev; for (i = 0; i < MTL_MAX_RX_QUEUES; i++)
u64_stats_init(&priv->xstats.rxq_stats[i].syncp);
- for (i = 0; i < MTL_MAX_TX_QUEUES; i++)
u64_stats_init(&priv->xstats.txq_stats[i].syncp);
u64_stats_init(&priv->xstats.rxq_stats[i].napi_syncp);
- for (i = 0; i < MTL_MAX_TX_QUEUES; i++) {
u64_stats_init(&priv->xstats.txq_stats[i].q_syncp);
u64_stats_init(&priv->xstats.txq_stats[i].napi_syncp);
- }
- priv->xstats.pcpu_stats =
devm_netdev_alloc_pcpu_stats(device, struct stmmac_pcpu_stats);
- if (!priv->xstats.pcpu_stats)
return -ENOMEM;
stmmac_set_ethtool_ops(ndev); priv->pause = pause; -- 2.43.0
Hello:
This patch was applied to netdev/net.git (main) by David S. Miller davem@davemloft.net:
On Sat, 3 Feb 2024 20:09:27 +0100 you wrote:
As explained by a comment in <linux/u64_stats_sync.h>, write side of struct u64_stats_sync must ensure mutual exclusion, or one seqcount update could be lost on 32-bit platforms, thus blocking readers forever. Such lockups have been observed in real world after stmmac_xmit() on one CPU raced with stmmac_napi_poll_tx() on another CPU.
To fix the issue without introducing a new lock, split the statistics into three parts:
[...]
Here is the summary with links: - [net,v3] net: stmmac: protect updates of 64-bit statistics counters https://git.kernel.org/netdev/net/c/38cc3c6dcc09
You are awesome, thank you!
Hi,
On Sat, Feb 03, 2024 at 08:09:27PM +0100, Petr Tesarik wrote:
As explained by a comment in <linux/u64_stats_sync.h>, write side of struct u64_stats_sync must ensure mutual exclusion, or one seqcount update could be lost on 32-bit platforms, thus blocking readers forever. Such lockups have been observed in real world after stmmac_xmit() on one CPU raced with stmmac_napi_poll_tx() on another CPU.
To fix the issue without introducing a new lock, split the statistics into three parts:
- fields updated only under the tx queue lock,
- fields updated only during NAPI poll,
- fields updated only from interrupt context,
Updates to fields in the first two groups are already serialized through other locks. It is sufficient to split the existing struct u64_stats_sync so that each group has its own.
Note that tx_set_ic_bit is updated from both contexts. Split this counter so that each context gets its own, and calculate their sum to get the total value in stmmac_get_ethtool_stats().
For the third group, multiple interrupts may be processed by different CPUs at the same time, but interrupts on the same CPU will not nest. Move fields from this group to a newly created per-cpu struct stmmac_pcpu_stats.
Fixes: 133466c3bbe1 ("net: stmmac: use per-queue 64 bit statistics where necessary") Link: https://lore.kernel.org/netdev/Za173PhviYg-1qIn@torres.zugschlus.de/t/ Cc: stable@vger.kernel.org Signed-off-by: Petr Tesarik petr@tesarici.cz
This patch results in a lockdep splat. Backtrace and bisect results attached.
Guenter
--- [ 33.736728] ================================ [ 33.736805] WARNING: inconsistent lock state [ 33.736953] 6.8.0-rc4 #1 Tainted: G N [ 33.737080] -------------------------------- [ 33.737155] inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. [ 33.737309] kworker/0:2/39 [HC1[1]:SC0[2]:HE0:SE0] takes: [ 33.737459] ef792074 (&syncp->seq#2){?...}-{0:0}, at: sun8i_dwmac_dma_interrupt+0x9c/0x28c [ 33.738206] {HARDIRQ-ON-W} state was registered at: [ 33.738318] lock_acquire+0x11c/0x368 [ 33.738431] __u64_stats_update_begin+0x104/0x1ac [ 33.738525] stmmac_xmit+0x4d0/0xc58 [ 33.738605] dev_hard_start_xmit+0xc4/0x2a0 [ 33.738689] sch_direct_xmit+0xf8/0x30c [ 33.738763] __dev_queue_xmit+0x400/0xcc4 [ 33.738831] ip6_finish_output2+0x254/0xafc [ 33.738903] mld_sendpack+0x260/0x5b0 [ 33.738969] mld_ifc_work+0x274/0x588 [ 33.739032] process_one_work+0x230/0x604 [ 33.739101] worker_thread+0x1dc/0x494 [ 33.739165] kthread+0x100/0x120 [ 33.739225] ret_from_fork+0x14/0x28 [ 33.739302] irq event stamp: 3553 [ 33.739371] hardirqs last enabled at (3552): [<c03e884c>] __call_rcu_common.constprop.0+0x1a4/0x6b4 [ 33.739515] hardirqs last disabled at (3553): [<c0300bd4>] __irq_svc+0x54/0xb8 [ 33.739638] softirqs last enabled at (3542): [<c1254a60>] neigh_resolve_output+0x1fc/0x254 [ 33.739795] softirqs last disabled at (3546): [<c1243798>] __dev_queue_xmit+0x48/0xcc4 [ 33.739919] [ 33.739919] other info that might help us debug this: [ 33.740021] Possible unsafe locking scenario: [ 33.740021] [ 33.740111] CPU0 [ 33.740158] ---- [ 33.740204] lock(&syncp->seq#2); [ 33.740314] <Interrupt> [ 33.740363] lock(&syncp->seq#2); [ 33.740511] [ 33.740511] *** DEADLOCK *** [ 33.740511] [ 33.740665] 8 locks held by kworker/0:2/39: [ 33.740761] #0: c4bfb2a8 ((wq_completion)mld){+.+.}-{0:0}, at: process_one_work+0x168/0x604 [ 33.741025] #1: f0909f20 ((work_completion)(&(&idev->mc_ifc_work)->work)){+.+.}-{0:0}, at: process_one_work+0x168/0x604 [ 33.741230] #2: c328baac (&idev->mc_lock){+.+.}-{3:3}, 
at: mld_ifc_work+0x24/0x588 [ 33.741387] #3: c2191488 (rcu_read_lock){....}-{1:2}, at: mld_sendpack+0x0/0x5b0 [ 33.741553] #4: c2191488 (rcu_read_lock){....}-{1:2}, at: ip6_finish_output2+0x174/0xafc [ 33.741716] #5: c219149c (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x48/0xcc4 [ 33.741877] #6: c4d3a974 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_queue_xmit+0x334/0xcc4 [ 33.742070] #7: c49e5050 (_xmit_ETHER#2){+...}-{2:2}, at: sch_direct_xmit+0x158/0x30c [ 33.742250] [ 33.742250] stack backtrace: [ 33.742426] CPU: 0 PID: 39 Comm: kworker/0:2 Tainted: G N 6.8.0-rc4 #1 [ 33.742578] Hardware name: Allwinner sun8i Family [ 33.742776] Workqueue: mld mld_ifc_work [ 33.742998] unwind_backtrace from show_stack+0x10/0x14 [ 33.743119] show_stack from dump_stack_lvl+0x68/0x90 [ 33.743232] dump_stack_lvl from mark_lock.part.0+0xbd8/0x12d8 [ 33.743345] mark_lock.part.0 from __lock_acquire+0xad4/0x224c [ 33.743458] __lock_acquire from lock_acquire+0x11c/0x368 [ 33.743564] lock_acquire from __u64_stats_update_begin+0x104/0x1ac [ 33.743683] __u64_stats_update_begin from sun8i_dwmac_dma_interrupt+0x9c/0x28c [ 33.743805] sun8i_dwmac_dma_interrupt from stmmac_napi_check+0x40/0x1c8 [ 33.743917] stmmac_napi_check from stmmac_interrupt+0xa4/0x154 [ 33.744020] stmmac_interrupt from __handle_irq_event_percpu+0xcc/0x2ec [ 33.744134] __handle_irq_event_percpu from handle_irq_event+0x38/0x80 [ 33.744243] handle_irq_event from handle_fasteoi_irq+0x9c/0x1c4 [ 33.744346] handle_fasteoi_irq from generic_handle_domain_irq+0x28/0x38 [ 33.744459] generic_handle_domain_irq from gic_handle_irq+0x98/0xcc [ 33.744567] gic_handle_irq from generic_handle_arch_irq+0x34/0x44 [ 33.744673] generic_handle_arch_irq from call_with_stack+0x18/0x20 [ 33.744831] call_with_stack from __irq_svc+0x9c/0xb8 [ 33.745018] Exception stack(0xf0909c00 to 0xf0909c48) [ 33.745221] 9c00: f0ab0000 c49e506c 0000005a 00000000 c0000006 f0ab0014 0000005a c0f5da68 [ 33.745387] 9c20: c35bd810 
c4b50000 c4b50000 c365d300 00000000 f0909c50 c0f70a70 c0f70a74 [ 33.745574] 9c40: 60000013 ffffffff [ 33.745668] __irq_svc from sun8i_dwmac_enable_dma_transmission+0x20/0x24 [ 33.745809] sun8i_dwmac_enable_dma_transmission from stmmac_xmit+0x790/0xc58 [ 33.745975] stmmac_xmit from dev_hard_start_xmit+0xc4/0x2a0 [ 33.746100] dev_hard_start_xmit from sch_direct_xmit+0xf8/0x30c [ 33.746220] sch_direct_xmit from __dev_queue_xmit+0x400/0xcc4 [ 33.746350] __dev_queue_xmit from ip6_finish_output2+0x254/0xafc [ 33.746462] ip6_finish_output2 from mld_sendpack+0x260/0x5b0 [ 33.746568] mld_sendpack from mld_ifc_work+0x274/0x588 [ 33.746670] mld_ifc_work from process_one_work+0x230/0x604 [ 33.746793] process_one_work from worker_thread+0x1dc/0x494 [ 33.746906] worker_thread from kthread+0x100/0x120 [ 33.746994] kthread from ret_from_fork+0x14/0x28 [ 33.747076] Exception stack(0xf0909fb0 to 0xf0909ff8) [ 33.747165] 9fa0: 00000000 00000000 00000000 00000000 [ 33.747303] 9fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 [ 33.747433] 9fe0: 00000000 00000000 00000000 00000000 00000013 00000000
--- # bad: [841c35169323cd833294798e58b9bf63fa4fa1de] Linux 6.8-rc4 # good: [54be6c6c5ae8e0d93a6c4641cb7528eb0b6ba478] Linux 6.8-rc3 git bisect start 'HEAD' 'v6.8-rc3' # bad: [c76b766ec50d3d43e2dacea53a733b285f4b730d] Merge tag 'drm-fixes-2024-02-09' of git://anongit.freedesktop.org/drm/drm git bisect bad c76b766ec50d3d43e2dacea53a733b285f4b730d # bad: [63e4b9d693e0f8c28359c7ea81e1ee510864c37b] Merge tag 'nf-24-02-08' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf git bisect bad 63e4b9d693e0f8c28359c7ea81e1ee510864c37b # bad: [75428f537d7cae33c7e4dd726144074f78622c09] net: intel: fix old compiler regressions git bisect bad 75428f537d7cae33c7e4dd726144074f78622c09 # good: [1a1c13303ff6d64e6f718dc8aa614e580ca8d9b4] nfp: flower: prevent re-adding mac index for bonded port git bisect good 1a1c13303ff6d64e6f718dc8aa614e580ca8d9b4 # good: [3871aa01e1a779d866fa9dfdd5a836f342f4eb87] tipc: Check the bearer type before calling tipc_udp_nl_bearer_add() git bisect good 3871aa01e1a779d866fa9dfdd5a836f342f4eb87 # good: [58086721b7781c3e35b19c9b78c8f5a791070ba3] devlink: avoid potential loop in devlink_rel_nested_in_notify_work() git bisect good 58086721b7781c3e35b19c9b78c8f5a791070ba3 # bad: [38cc3c6dcc09dc3a1800b5ec22aef643ca11eab8] net: stmmac: protect updates of 64-bit statistics counters git bisect bad 38cc3c6dcc09dc3a1800b5ec22aef643ca11eab8 # good: [cb88cb53badb8aeb3955ad6ce80b07b598e310b8] ppp_async: limit MRU to 64K git bisect good cb88cb53badb8aeb3955ad6ce80b07b598e310b8 # first bad commit: [38cc3c6dcc09dc3a1800b5ec22aef643ca11eab8] net: stmmac: protect updates of 64-bit statistics counters
On Sun, Feb 11, 2024 at 08:30:21PM -0800, Guenter Roeck wrote:
Hi,
On Sat, Feb 03, 2024 at 08:09:27PM +0100, Petr Tesarik wrote:
As explained by a comment in <linux/u64_stats_sync.h>, write side of struct u64_stats_sync must ensure mutual exclusion, or one seqcount update could be lost on 32-bit platforms, thus blocking readers forever. Such lockups have been observed in real world after stmmac_xmit() on one CPU raced with stmmac_napi_poll_tx() on another CPU.
To fix the issue without introducing a new lock, split the statistics into three parts:
- fields updated only under the tx queue lock,
- fields updated only during NAPI poll,
- fields updated only from interrupt context,
Updates to fields in the first two groups are already serialized through other locks. It is sufficient to split the existing struct u64_stats_sync so that each group has its own.
Note that tx_set_ic_bit is updated from both contexts. Split this counter so that each context gets its own, and calculate their sum to get the total value in stmmac_get_ethtool_stats().
For the third group, multiple interrupts may be processed by different CPUs at the same time, but interrupts on the same CPU will not nest. Move fields from this group to a newly created per-cpu struct stmmac_pcpu_stats.
Fixes: 133466c3bbe1 ("net: stmmac: use per-queue 64 bit statistics where necessary") Link: https://lore.kernel.org/netdev/Za173PhviYg-1qIn@torres.zugschlus.de/t/ Cc: stable@vger.kernel.org Signed-off-by: Petr Tesarik petr@tesarici.cz
This patch results in a lockdep splat. Backtrace and bisect results attached.
Guenter
[ 33.736728] ================================ [ 33.736805] WARNING: inconsistent lock state [ 33.736953] 6.8.0-rc4 #1 Tainted: G N [ 33.737080] -------------------------------- [ 33.737155] inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. [ 33.737309] kworker/0:2/39 [HC1[1]:SC0[2]:HE0:SE0] takes: [ 33.737459] ef792074 (&syncp->seq#2){?...}-{0:0}, at: sun8i_dwmac_dma_interrupt+0x9c/0x28c [ 33.738206] {HARDIRQ-ON-W} state was registered at: [ 33.738318] lock_acquire+0x11c/0x368 [ 33.738431] __u64_stats_update_begin+0x104/0x1ac [ 33.738525] stmmac_xmit+0x4d0/0xc58
interesting lockdep splat... stmmac_xmit() operates on txq_stats->q_syncp, while the sun8i_dwmac_dma_interrupt() operates on pcpu's priv->xstats.pcpu_stats they are different syncp. so how does lockdep splat happen.
[ 33.738605] dev_hard_start_xmit+0xc4/0x2a0 [ 33.738689] sch_direct_xmit+0xf8/0x30c [ 33.738763] __dev_queue_xmit+0x400/0xcc4 [ 33.738831] ip6_finish_output2+0x254/0xafc [ 33.738903] mld_sendpack+0x260/0x5b0 [ 33.738969] mld_ifc_work+0x274/0x588 [ 33.739032] process_one_work+0x230/0x604 [ 33.739101] worker_thread+0x1dc/0x494 [ 33.739165] kthread+0x100/0x120 [ 33.739225] ret_from_fork+0x14/0x28 [ 33.739302] irq event stamp: 3553 [ 33.739371] hardirqs last enabled at (3552): [<c03e884c>] __call_rcu_common.constprop.0+0x1a4/0x6b4 [ 33.739515] hardirqs last disabled at (3553): [<c0300bd4>] __irq_svc+0x54/0xb8 [ 33.739638] softirqs last enabled at (3542): [<c1254a60>] neigh_resolve_output+0x1fc/0x254 [ 33.739795] softirqs last disabled at (3546): [<c1243798>] __dev_queue_xmit+0x48/0xcc4 [ 33.739919] [ 33.739919] other info that might help us debug this: [ 33.740021] Possible unsafe locking scenario: [ 33.740021] [ 33.740111] CPU0 [ 33.740158] ---- [ 33.740204] lock(&syncp->seq#2); [ 33.740314] <Interrupt> [ 33.740363] lock(&syncp->seq#2); [ 33.740511] [ 33.740511] *** DEADLOCK *** [ 33.740511] [ 33.740665] 8 locks held by kworker/0:2/39: [ 33.740761] #0: c4bfb2a8 ((wq_completion)mld){+.+.}-{0:0}, at: process_one_work+0x168/0x604 [ 33.741025] #1: f0909f20 ((work_completion)(&(&idev->mc_ifc_work)->work)){+.+.}-{0:0}, at: process_one_work+0x168/0x604 [ 33.741230] #2: c328baac (&idev->mc_lock){+.+.}-{3:3}, at: mld_ifc_work+0x24/0x588 [ 33.741387] #3: c2191488 (rcu_read_lock){....}-{1:2}, at: mld_sendpack+0x0/0x5b0 [ 33.741553] #4: c2191488 (rcu_read_lock){....}-{1:2}, at: ip6_finish_output2+0x174/0xafc [ 33.741716] #5: c219149c (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x48/0xcc4 [ 33.741877] #6: c4d3a974 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_queue_xmit+0x334/0xcc4 [ 33.742070] #7: c49e5050 (_xmit_ETHER#2){+...}-{2:2}, at: sch_direct_xmit+0x158/0x30c [ 33.742250] [ 33.742250] stack backtrace: [ 33.742426] CPU: 0 PID: 39 Comm: 
kworker/0:2 Tainted: G N 6.8.0-rc4 #1 [ 33.742578] Hardware name: Allwinner sun8i Family [ 33.742776] Workqueue: mld mld_ifc_work [ 33.742998] unwind_backtrace from show_stack+0x10/0x14 [ 33.743119] show_stack from dump_stack_lvl+0x68/0x90 [ 33.743232] dump_stack_lvl from mark_lock.part.0+0xbd8/0x12d8 [ 33.743345] mark_lock.part.0 from __lock_acquire+0xad4/0x224c [ 33.743458] __lock_acquire from lock_acquire+0x11c/0x368 [ 33.743564] lock_acquire from __u64_stats_update_begin+0x104/0x1ac [ 33.743683] __u64_stats_update_begin from sun8i_dwmac_dma_interrupt+0x9c/0x28c [ 33.743805] sun8i_dwmac_dma_interrupt from stmmac_napi_check+0x40/0x1c8 [ 33.743917] stmmac_napi_check from stmmac_interrupt+0xa4/0x154 [ 33.744020] stmmac_interrupt from __handle_irq_event_percpu+0xcc/0x2ec [ 33.744134] __handle_irq_event_percpu from handle_irq_event+0x38/0x80 [ 33.744243] handle_irq_event from handle_fasteoi_irq+0x9c/0x1c4 [ 33.744346] handle_fasteoi_irq from generic_handle_domain_irq+0x28/0x38 [ 33.744459] generic_handle_domain_irq from gic_handle_irq+0x98/0xcc [ 33.744567] gic_handle_irq from generic_handle_arch_irq+0x34/0x44 [ 33.744673] generic_handle_arch_irq from call_with_stack+0x18/0x20 [ 33.744831] call_with_stack from __irq_svc+0x9c/0xb8 [ 33.745018] Exception stack(0xf0909c00 to 0xf0909c48) [ 33.745221] 9c00: f0ab0000 c49e506c 0000005a 00000000 c0000006 f0ab0014 0000005a c0f5da68 [ 33.745387] 9c20: c35bd810 c4b50000 c4b50000 c365d300 00000000 f0909c50 c0f70a70 c0f70a74 [ 33.745574] 9c40: 60000013 ffffffff [ 33.745668] __irq_svc from sun8i_dwmac_enable_dma_transmission+0x20/0x24 [ 33.745809] sun8i_dwmac_enable_dma_transmission from stmmac_xmit+0x790/0xc58 [ 33.745975] stmmac_xmit from dev_hard_start_xmit+0xc4/0x2a0 [ 33.746100] dev_hard_start_xmit from sch_direct_xmit+0xf8/0x30c [ 33.746220] sch_direct_xmit from __dev_queue_xmit+0x400/0xcc4 [ 33.746350] __dev_queue_xmit from ip6_finish_output2+0x254/0xafc [ 33.746462] ip6_finish_output2 from mld_sendpack+0x260/0x5b0 [ 
33.746568] mld_sendpack from mld_ifc_work+0x274/0x588 [ 33.746670] mld_ifc_work from process_one_work+0x230/0x604 [ 33.746793] process_one_work from worker_thread+0x1dc/0x494 [ 33.746906] worker_thread from kthread+0x100/0x120 [ 33.746994] kthread from ret_from_fork+0x14/0x28 [ 33.747076] Exception stack(0xf0909fb0 to 0xf0909ff8) [ 33.747165] 9fa0: 00000000 00000000 00000000 00000000 [ 33.747303] 9fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 [ 33.747433] 9fe0: 00000000 00000000 00000000 00000000 00000013 00000000
# bad: [841c35169323cd833294798e58b9bf63fa4fa1de] Linux 6.8-rc4 # good: [54be6c6c5ae8e0d93a6c4641cb7528eb0b6ba478] Linux 6.8-rc3 git bisect start 'HEAD' 'v6.8-rc3' # bad: [c76b766ec50d3d43e2dacea53a733b285f4b730d] Merge tag 'drm-fixes-2024-02-09' of git://anongit.freedesktop.org/drm/drm git bisect bad c76b766ec50d3d43e2dacea53a733b285f4b730d # bad: [63e4b9d693e0f8c28359c7ea81e1ee510864c37b] Merge tag 'nf-24-02-08' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf git bisect bad 63e4b9d693e0f8c28359c7ea81e1ee510864c37b # bad: [75428f537d7cae33c7e4dd726144074f78622c09] net: intel: fix old compiler regressions git bisect bad 75428f537d7cae33c7e4dd726144074f78622c09 # good: [1a1c13303ff6d64e6f718dc8aa614e580ca8d9b4] nfp: flower: prevent re-adding mac index for bonded port git bisect good 1a1c13303ff6d64e6f718dc8aa614e580ca8d9b4 # good: [3871aa01e1a779d866fa9dfdd5a836f342f4eb87] tipc: Check the bearer type before calling tipc_udp_nl_bearer_add() git bisect good 3871aa01e1a779d866fa9dfdd5a836f342f4eb87 # good: [58086721b7781c3e35b19c9b78c8f5a791070ba3] devlink: avoid potential loop in devlink_rel_nested_in_notify_work() git bisect good 58086721b7781c3e35b19c9b78c8f5a791070ba3 # bad: [38cc3c6dcc09dc3a1800b5ec22aef643ca11eab8] net: stmmac: protect updates of 64-bit statistics counters git bisect bad 38cc3c6dcc09dc3a1800b5ec22aef643ca11eab8 # good: [cb88cb53badb8aeb3955ad6ce80b07b598e310b8] ppp_async: limit MRU to 64K git bisect good cb88cb53badb8aeb3955ad6ce80b07b598e310b8 # first bad commit: [38cc3c6dcc09dc3a1800b5ec22aef643ca11eab8] net: stmmac: protect updates of 64-bit statistics counters
On Tue, Feb 13, 2024 at 3:29 PM Jisheng Zhang jszhang@kernel.org wrote:
On Sun, Feb 11, 2024 at 08:30:21PM -0800, Guenter Roeck wrote:
Hi,
On Sat, Feb 03, 2024 at 08:09:27PM +0100, Petr Tesarik wrote:
As explained by a comment in <linux/u64_stats_sync.h>, write side of struct u64_stats_sync must ensure mutual exclusion, or one seqcount update could be lost on 32-bit platforms, thus blocking readers forever. Such lockups have been observed in real world after stmmac_xmit() on one CPU raced with stmmac_napi_poll_tx() on another CPU.
To fix the issue without introducing a new lock, split the statistics into three parts:
- fields updated only under the tx queue lock,
- fields updated only during NAPI poll,
- fields updated only from interrupt context,
Updates to fields in the first two groups are already serialized through other locks. It is sufficient to split the existing struct u64_stats_sync so that each group has its own.
Note that tx_set_ic_bit is updated from both contexts. Split this counter so that each context gets its own, and calculate their sum to get the total value in stmmac_get_ethtool_stats().
For the third group, multiple interrupts may be processed by different CPUs at the same time, but interrupts on the same CPU will not nest. Move fields from this group to a newly created per-cpu struct stmmac_pcpu_stats.
Fixes: 133466c3bbe1 ("net: stmmac: use per-queue 64 bit statistics where necessary") Link: https://lore.kernel.org/netdev/Za173PhviYg-1qIn@torres.zugschlus.de/t/ Cc: stable@vger.kernel.org Signed-off-by: Petr Tesarik petr@tesarici.cz
This patch results in a lockdep splat. Backtrace and bisect results attached.
Guenter
[ 33.736728] ================================ [ 33.736805] WARNING: inconsistent lock state [ 33.736953] 6.8.0-rc4 #1 Tainted: G N [ 33.737080] -------------------------------- [ 33.737155] inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. [ 33.737309] kworker/0:2/39 [HC1[1]:SC0[2]:HE0:SE0] takes: [ 33.737459] ef792074 (&syncp->seq#2){?...}-{0:0}, at: sun8i_dwmac_dma_interrupt+0x9c/0x28c [ 33.738206] {HARDIRQ-ON-W} state was registered at: [ 33.738318] lock_acquire+0x11c/0x368 [ 33.738431] __u64_stats_update_begin+0x104/0x1ac [ 33.738525] stmmac_xmit+0x4d0/0xc58
interesting lockdep splat... stmmac_xmit() operates on txq_stats->q_syncp, while the sun8i_dwmac_dma_interrupt() operates on pcpu's priv->xstats.pcpu_stats they are different syncp. so how does lockdep splat happen.
Right, I do not see anything obvious yet.
On Tue, Feb 13, 2024 at 03:51:35PM +0100, Eric Dumazet wrote:
On Tue, Feb 13, 2024 at 3:29 PM Jisheng Zhang jszhang@kernel.org wrote:
On Sun, Feb 11, 2024 at 08:30:21PM -0800, Guenter Roeck wrote:
Hi,
On Sat, Feb 03, 2024 at 08:09:27PM +0100, Petr Tesarik wrote:
As explained by a comment in <linux/u64_stats_sync.h>, write side of struct u64_stats_sync must ensure mutual exclusion, or one seqcount update could be lost on 32-bit platforms, thus blocking readers forever. Such lockups have been observed in real world after stmmac_xmit() on one CPU raced with stmmac_napi_poll_tx() on another CPU.
To fix the issue without introducing a new lock, split the statistics into three parts:
- fields updated only under the tx queue lock,
- fields updated only during NAPI poll,
- fields updated only from interrupt context,
Updates to fields in the first two groups are already serialized through other locks. It is sufficient to split the existing struct u64_stats_sync so that each group has its own.
Note that tx_set_ic_bit is updated from both contexts. Split this counter so that each context gets its own, and calculate their sum to get the total value in stmmac_get_ethtool_stats().
For the third group, multiple interrupts may be processed by different CPUs at the same time, but interrupts on the same CPU will not nest. Move fields from this group to a newly created per-cpu struct stmmac_pcpu_stats.
Fixes: 133466c3bbe1 ("net: stmmac: use per-queue 64 bit statistics where necessary") Link: https://lore.kernel.org/netdev/Za173PhviYg-1qIn@torres.zugschlus.de/t/ Cc: stable@vger.kernel.org Signed-off-by: Petr Tesarik petr@tesarici.cz
This patch results in a lockdep splat. Backtrace and bisect results attached.
Guenter
[ 33.736728] ================================ [ 33.736805] WARNING: inconsistent lock state [ 33.736953] 6.8.0-rc4 #1 Tainted: G N [ 33.737080] -------------------------------- [ 33.737155] inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. [ 33.737309] kworker/0:2/39 [HC1[1]:SC0[2]:HE0:SE0] takes: [ 33.737459] ef792074 (&syncp->seq#2){?...}-{0:0}, at: sun8i_dwmac_dma_interrupt+0x9c/0x28c [ 33.738206] {HARDIRQ-ON-W} state was registered at: [ 33.738318] lock_acquire+0x11c/0x368 [ 33.738431] __u64_stats_update_begin+0x104/0x1ac [ 33.738525] stmmac_xmit+0x4d0/0xc58
interesting lockdep splat... stmmac_xmit() operates on txq_stats->q_syncp, while the sun8i_dwmac_dma_interrupt() operates on pcpu's priv->xstats.pcpu_stats they are different syncp. so how does lockdep splat happen.
Right, I do not see anything obvious yet.
Wild guess: I think it maybe saying that due to
inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
the critical code may somehow be interrupted and, while handling the interrupt, try to acquire the same lock again.
Guenter
On Tue, Feb 13, 2024 at 4:26 PM Guenter Roeck linux@roeck-us.net wrote:
On Tue, Feb 13, 2024 at 03:51:35PM +0100, Eric Dumazet wrote:
On Tue, Feb 13, 2024 at 3:29 PM Jisheng Zhang jszhang@kernel.org wrote:
On Sun, Feb 11, 2024 at 08:30:21PM -0800, Guenter Roeck wrote:
Hi,
On Sat, Feb 03, 2024 at 08:09:27PM +0100, Petr Tesarik wrote:
As explained by a comment in <linux/u64_stats_sync.h>, write side of struct u64_stats_sync must ensure mutual exclusion, or one seqcount update could be lost on 32-bit platforms, thus blocking readers forever. Such lockups have been observed in real world after stmmac_xmit() on one CPU raced with stmmac_napi_poll_tx() on another CPU.
To fix the issue without introducing a new lock, split the statistics into three parts:
- fields updated only under the tx queue lock,
- fields updated only during NAPI poll,
- fields updated only from interrupt context,
Updates to fields in the first two groups are already serialized through other locks. It is sufficient to split the existing struct u64_stats_sync so that each group has its own.
Note that tx_set_ic_bit is updated from both contexts. Split this counter so that each context gets its own, and calculate their sum to get the total value in stmmac_get_ethtool_stats().
For the third group, multiple interrupts may be processed by different CPUs at the same time, but interrupts on the same CPU will not nest. Move fields from this group to a newly created per-cpu struct stmmac_pcpu_stats.
Fixes: 133466c3bbe1 ("net: stmmac: use per-queue 64 bit statistics where necessary") Link: https://lore.kernel.org/netdev/Za173PhviYg-1qIn@torres.zugschlus.de/t/ Cc: stable@vger.kernel.org Signed-off-by: Petr Tesarik petr@tesarici.cz
This patch results in a lockdep splat. Backtrace and bisect results attached.
Guenter
[ 33.736728] ================================ [ 33.736805] WARNING: inconsistent lock state [ 33.736953] 6.8.0-rc4 #1 Tainted: G N [ 33.737080] -------------------------------- [ 33.737155] inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. [ 33.737309] kworker/0:2/39 [HC1[1]:SC0[2]:HE0:SE0] takes: [ 33.737459] ef792074 (&syncp->seq#2){?...}-{0:0}, at: sun8i_dwmac_dma_interrupt+0x9c/0x28c [ 33.738206] {HARDIRQ-ON-W} state was registered at: [ 33.738318] lock_acquire+0x11c/0x368 [ 33.738431] __u64_stats_update_begin+0x104/0x1ac [ 33.738525] stmmac_xmit+0x4d0/0xc58
interesting lockdep splat... stmmac_xmit() operates on txq_stats->q_syncp, while the sun8i_dwmac_dma_interrupt() operates on pcpu's priv->xstats.pcpu_stats they are different syncp. so how does lockdep splat happen.
Right, I do not see anything obvious yet.
Wild guess: I think it maybe saying that due to
inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
the critical code may somehow be interrupted and, while handling the interrupt, try to acquire the same lock again.
This should not happen, the 'syncp' are different. They have different lockdep classes.
One is exclusively used from hard irq context.
The second one only used from BH context.
Hello, FWIW I'm seeing this splat too on STM32MP157 with 6.8.0-rc5 (from wireless tree). It happens systematically a few seconds after link up
[ 27.884703] ================================ [ 27.888988] WARNING: inconsistent lock state [ 27.893271] 6.8.0-rc5-g59460f7f45e6-dirty #16 Not tainted [ 27.898671] -------------------------------- [ 27.902951] inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. [ 27.908954] swapper/0/0 [HC1[1]:SC0[0]:HE0:SE1] takes: [ 27.914155] d7b764ac (&syncp->seq#3){?.-.}-{0:0}, at: dwmac4_dma_interrupt+0xc4/0x2a8 [ 27.921974] {HARDIRQ-ON-W} state was registered at: [ 27.926863] lock_acquire+0x12c/0x388 [ 27.930563] __u64_stats_update_begin+0x138/0x214 [ 27.935372] stmmac_xmit+0x55c/0xd80 [ 27.939064] dev_hard_start_xmit+0xec/0x2f4 [ 27.943362] sch_direct_xmit+0x94/0x310 [ 27.947255] __dev_queue_xmit+0x3f8/0xd04 [ 27.951347] ip6_finish_output2+0x2fc/0xbc0 [ 27.955642] mld_sendpack+0x268/0x594 [ 27.959329] mld_ifc_work+0x268/0x568 [ 27.963115] process_one_work+0x20c/0x618 [ 27.967216] worker_thread+0x1e8/0x4ac [ 27.971009] kthread+0x110/0x130 [ 27.974296] ret_from_fork+0x14/0x28 [ 27.977982] irq event stamp: 12456 [ 27.981353] hardirqs last enabled at (12455): [<c08e3558>] default_idle_call+0x1c/0x2cc [ 27.989507] hardirqs last disabled at (12456): [<c0100b74>] __irq_svc+0x54/0xd0 [ 27.996844] softirqs last enabled at (12440): [<c010162c>] __do_softirq+0x318/0x4dc [ 28.004586] softirqs last disabled at (12429): [<c012b2a8>] __irq_exit_rcu+0x130/0x184 [ 28.012530] [ 28.012530] other info that might help us debug this: [ 28.019040] Possible unsafe locking scenario: [ 28.019040] [ 28.025043] CPU0 [ 28.027400] ---- [ 28.029857] lock(&syncp->seq#3); [ 28.033253] <Interrupt> [ 28.035912] lock(&syncp->seq#3); [ 28.039410] [ 28.039410] *** DEADLOCK *** [ 28.039410] [ 28.045416] no locks held by swapper/0/0. 
[ 28.049395] [ 28.049395] stack backtrace: [ 28.053781] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.8.0-rc5-g59460f7f45e6-dirty #16 [ 28.061819] Hardware name: STM32 (Device Tree Support) [ 28.066918] unwind_backtrace from show_stack+0x18/0x1c [ 28.072140] show_stack from dump_stack_lvl+0x58/0x70 [ 28.077253] dump_stack_lvl from mark_lock+0xc40/0x12fc [ 28.082478] mark_lock from __lock_acquire+0x968/0x2c20 [ 28.087703] __lock_acquire from lock_acquire+0x12c/0x388 [ 28.093131] lock_acquire from __u64_stats_update_begin+0x138/0x214 [ 28.099372] __u64_stats_update_begin from dwmac4_dma_interrupt+0xc4/0x2a8 [ 28.106219] dwmac4_dma_interrupt from stmmac_napi_check+0x48/0x1d0 [ 28.112558] stmmac_napi_check from stmmac_interrupt+0xa4/0x184 [ 28.118490] stmmac_interrupt from __handle_irq_event_percpu+0xb0/0x308 [ 28.125036] __handle_irq_event_percpu from handle_irq_event+0x40/0x88 [ 28.131578] handle_irq_event from handle_fasteoi_irq+0xa4/0x258 [ 28.137610] handle_fasteoi_irq from generic_handle_domain_irq+0x30/0x40 [ 28.144348] generic_handle_domain_irq from gic_handle_irq+0x7c/0x90 [ 28.150682] gic_handle_irq from generic_handle_arch_irq+0x34/0x44 [ 28.156911] generic_handle_arch_irq from __irq_svc+0x8c/0xd0 [ 28.162631] Exception stack(0xc2201f30 to 0xc2201f78) [ 28.167732] 1f20: ffffffff ffffffff 00000001 000030a7 [ 28.175974] 1f40: c220c780 c0178dc4 c2208d54 c22c2e10 00000000 00000000 c0b06d28 c220c22c [ 28.184114] 1f60: 00000000 c2201f80 c08e3558 c08e355c 600f0013 ffffffff [ 28.190727] __irq_svc from default_idle_call+0x20/0x2cc [ 28.196045] default_idle_call from do_idle+0xd8/0x144 [ 28.201165] do_idle from cpu_startup_entry+0x30/0x34 [ 28.206181] cpu_startup_entry from rest_init+0xf4/0x198 [ 28.211502] rest_init from arch_post_acpi_subsys_init+0x0/0x18
Net maintainers, chiming in here, as it seems handling this regression stalled.
On 13.02.24 16:52, Eric Dumazet wrote:
On Tue, Feb 13, 2024 at 4:26 PM Guenter Roeck linux@roeck-us.net wrote:
On Tue, Feb 13, 2024 at 03:51:35PM +0100, Eric Dumazet wrote:
On Tue, Feb 13, 2024 at 3:29 PM Jisheng Zhang jszhang@kernel.org wrote:
On Sun, Feb 11, 2024 at 08:30:21PM -0800, Guenter Roeck wrote:
On Sat, Feb 03, 2024 at 08:09:27PM +0100, Petr Tesarik wrote:
As explained by a comment in <linux/u64_stats_sync.h>, write side of struct u64_stats_sync must ensure mutual exclusion, or one seqcount update could be lost on 32-bit platforms, thus blocking readers forever. Such lockups have been observed in real world after stmmac_xmit() on one CPU raced with stmmac_napi_poll_tx() on another CPU.
To fix the issue without introducing a new lock, split the statics into three parts:
- fields updated only under the tx queue lock,
- fields updated only during NAPI poll,
- fields updated only from interrupt context,
Updates to fields in the first two groups are already serialized through other locks. It is sufficient to split the existing struct u64_stats_sync so that each group has its own.
Note that tx_set_ic_bit is updated from both contexts. Split this counter so that each context gets its own, and calculate their sum to get the total value in stmmac_get_ethtool_stats().
For the third group, multiple interrupts may be processed by different CPUs at the same time, but interrupts on the same CPU will not nest. Move fields from this group to a newly created per-cpu struct stmmac_pcpu_stats.
Fixes: 133466c3bbe1 ("net: stmmac: use per-queue 64 bit statistics where necessary") Link: https://lore.kernel.org/netdev/Za173PhviYg-1qIn@torres.zugschlus.de/t/ Cc: stable@vger.kernel.org Signed-off-by: Petr Tesarik petr@tesarici.cz
This patch results in a lockdep splat. Backtrace and bisect results attached.
[ 33.736728] ================================ [ 33.736805] WARNING: inconsistent lock state [ 33.736953] 6.8.0-rc4 #1 Tainted: G N [ 33.737080] -------------------------------- [ 33.737155] inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. [ 33.737309] kworker/0:2/39 [HC1[1]:SC0[2]:HE0:SE0] takes: [ 33.737459] ef792074 (&syncp->seq#2){?...}-{0:0}, at: sun8i_dwmac_dma_interrupt+0x9c/0x28c [ 33.738206] {HARDIRQ-ON-W} state was registered at: [ 33.738318] lock_acquire+0x11c/0x368 [ 33.738431] __u64_stats_update_begin+0x104/0x1ac [ 33.738525] stmmac_xmit+0x4d0/0xc58
interesting lockdep splat... stmmac_xmit() operates on txq_stats->q_syncp, while the sun8i_dwmac_dma_interrupt() operates on pcpu's priv->xstats.pcpu_stats they are different syncp. so how does lockdep splat happen.
Right, I do not see anything obvious yet.
Wild guess: I think it maybe saying that due to
inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
the critical code may somehow be interrupted and, while handling the interrupt, try to acquire the same lock again.
This should not happen, the 'syncp' are different. They have different lockdep classes.
One is exclusively used from hard irq context.
The second one only used from BH context.
Alexis Lothoré hit this now as well, see yesterday report in this thread; apart from that nothing seem to have happened for two weeks now. The change recently made it to some stable/longterm kernels, too. Makes me wonder:
What's the plan forward here? Is this considered to be a false positive? Or a real problem? Or a kind of situation along the lines of "that commit should not cause the problem we are seeing, so it might have exposed an older bug in the code, but nobody looked closer yet to check"? Or something else?
Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat) -- Everything you wanna know about Linux kernel regression tracking: https://linux-regtracking.leemhuis.info/about/#tldr If I did something stupid, please tell me, as explained on that page.
On Wed, 28 Feb 2024 07:19:56 +0100 "Linux regression tracking (Thorsten Leemhuis)" regressions@leemhuis.info wrote:
Net maintainers, chiming in here, as it seems handling this regression stalled.
Indeed, I was too busy with sandbox mode...
On 13.02.24 16:52, Eric Dumazet wrote:
On Tue, Feb 13, 2024 at 4:26 PM Guenter Roeck linux@roeck-us.net wrote:
On Tue, Feb 13, 2024 at 03:51:35PM +0100, Eric Dumazet wrote:
On Tue, Feb 13, 2024 at 3:29 PM Jisheng Zhang jszhang@kernel.org wrote:
On Sun, Feb 11, 2024 at 08:30:21PM -0800, Guenter Roeck wrote:
On Sat, Feb 03, 2024 at 08:09:27PM +0100, Petr Tesarik wrote: > As explained by a comment in <linux/u64_stats_sync.h>, write side of struct > u64_stats_sync must ensure mutual exclusion, or one seqcount update could > be lost on 32-bit platforms, thus blocking readers forever. Such lockups > have been observed in real world after stmmac_xmit() on one CPU raced with > stmmac_napi_poll_tx() on another CPU. > > To fix the issue without introducing a new lock, split the statics into > three parts: > > 1. fields updated only under the tx queue lock, > 2. fields updated only during NAPI poll, > 3. fields updated only from interrupt context, > > Updates to fields in the first two groups are already serialized through > other locks. It is sufficient to split the existing struct u64_stats_sync > so that each group has its own. > > Note that tx_set_ic_bit is updated from both contexts. Split this counter > so that each context gets its own, and calculate their sum to get the total > value in stmmac_get_ethtool_stats(). > > For the third group, multiple interrupts may be processed by different CPUs > at the same time, but interrupts on the same CPU will not nest. Move fields > from this group to a newly created per-cpu struct stmmac_pcpu_stats. > > Fixes: 133466c3bbe1 ("net: stmmac: use per-queue 64 bit statistics where necessary") > Link: https://lore.kernel.org/netdev/Za173PhviYg-1qIn@torres.zugschlus.de/t/ > Cc: stable@vger.kernel.org > Signed-off-by: Petr Tesarik petr@tesarici.cz
This patch results in a lockdep splat. Backtrace and bisect results attached.
[ 33.736728] ================================ [ 33.736805] WARNING: inconsistent lock state [ 33.736953] 6.8.0-rc4 #1 Tainted: G N [ 33.737080] -------------------------------- [ 33.737155] inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. [ 33.737309] kworker/0:2/39 [HC1[1]:SC0[2]:HE0:SE0] takes: [ 33.737459] ef792074 (&syncp->seq#2){?...}-{0:0}, at: sun8i_dwmac_dma_interrupt+0x9c/0x28c [ 33.738206] {HARDIRQ-ON-W} state was registered at: [ 33.738318] lock_acquire+0x11c/0x368 [ 33.738431] __u64_stats_update_begin+0x104/0x1ac [ 33.738525] stmmac_xmit+0x4d0/0xc58
interesting lockdep splat... stmmac_xmit() operates on txq_stats->q_syncp, while the sun8i_dwmac_dma_interrupt() operates on pcpu's priv->xstats.pcpu_stats they are different syncp. so how does lockdep splat happen.
Right, I do not see anything obvious yet.
Wild guess: I think it maybe saying that due to
inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
the critical code may somehow be interrupted and, while handling the interrupt, try to acquire the same lock again.
This should not happen, the 'syncp' are different. They have different lockdep classes.
One is exclusively used from hard irq context.
The second one only used from BH context.
Alexis Lothoré hit this now as well, see yesterday report in this thread; apart from that nothing seem to have happened for two weeks now. The change recently made it to some stable/longterm kernels, too. Makes me wonder:
What's the plan forward here? Is this considered to be a false positive?
Although my system has run stable for a couple of months, I am hesitant to call it a false positive.
Or a real problem?
That's what I think. But I would have to learn a lot about the network stack to understand what exactly happens here.
It may go faster if somebody else on the Cc can give me a hint where to start looking based on the lockdep warning.
Petr T
On 28.02.24 12:03, Petr Tesařík wrote:
On Wed, 28 Feb 2024 07:19:56 +0100 "Linux regression tracking (Thorsten Leemhuis)" regressions@leemhuis.info wrote:
Net maintainers, chiming in here, as it seems handling this regression stalled.
Indeed, I was too busy with sandbox mode...
Hmm, no reply in the past week to Petr's request for help from someone with more knowledge about the field. :-/
So I guess this means that this won't be fixed for 6.8? Unfortunate, but well, that's how it is sometimes.
Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat) -- Everything you wanna know about Linux kernel regression tracking: https://linux-regtracking.leemhuis.info/about/#tldr If I did something stupid, please tell me, as explained on that page.
#regzbot poke
On 13.02.24 16:52, Eric Dumazet wrote:
On Tue, Feb 13, 2024 at 4:26 PM Guenter Roeck linux@roeck-us.net wrote:
On Tue, Feb 13, 2024 at 03:51:35PM +0100, Eric Dumazet wrote:
On Tue, Feb 13, 2024 at 3:29 PM Jisheng Zhang jszhang@kernel.org wrote:
On Sun, Feb 11, 2024 at 08:30:21PM -0800, Guenter Roeck wrote: > On Sat, Feb 03, 2024 at 08:09:27PM +0100, Petr Tesarik wrote: >> As explained by a comment in <linux/u64_stats_sync.h>, write side of struct >> u64_stats_sync must ensure mutual exclusion, or one seqcount update could >> be lost on 32-bit platforms, thus blocking readers forever. Such lockups >> have been observed in real world after stmmac_xmit() on one CPU raced with >> stmmac_napi_poll_tx() on another CPU. >> >> To fix the issue without introducing a new lock, split the statics into >> three parts: >> >> 1. fields updated only under the tx queue lock, >> 2. fields updated only during NAPI poll, >> 3. fields updated only from interrupt context, >> >> Updates to fields in the first two groups are already serialized through >> other locks. It is sufficient to split the existing struct u64_stats_sync >> so that each group has its own. >> >> Note that tx_set_ic_bit is updated from both contexts. Split this counter >> so that each context gets its own, and calculate their sum to get the total >> value in stmmac_get_ethtool_stats(). >> >> For the third group, multiple interrupts may be processed by different CPUs >> at the same time, but interrupts on the same CPU will not nest. Move fields >> from this group to a newly created per-cpu struct stmmac_pcpu_stats. >> >> Fixes: 133466c3bbe1 ("net: stmmac: use per-queue 64 bit statistics where necessary") >> Link: https://lore.kernel.org/netdev/Za173PhviYg-1qIn@torres.zugschlus.de/t/ >> Cc: stable@vger.kernel.org >> Signed-off-by: Petr Tesarik petr@tesarici.cz > > This patch results in a lockdep splat. Backtrace and bisect results attached. > > --- > [ 33.736728] ================================ > [ 33.736805] WARNING: inconsistent lock state > [ 33.736953] 6.8.0-rc4 #1 Tainted: G N > [ 33.737080] -------------------------------- > [ 33.737155] inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. 
> [ 33.737309] kworker/0:2/39 [HC1[1]:SC0[2]:HE0:SE0] takes: > [ 33.737459] ef792074 (&syncp->seq#2){?...}-{0:0}, at: sun8i_dwmac_dma_interrupt+0x9c/0x28c > [ 33.738206] {HARDIRQ-ON-W} state was registered at: > [ 33.738318] lock_acquire+0x11c/0x368 > [ 33.738431] __u64_stats_update_begin+0x104/0x1ac > [ 33.738525] stmmac_xmit+0x4d0/0xc58
interesting lockdep splat... stmmac_xmit() operates on txq_stats->q_syncp, while the sun8i_dwmac_dma_interrupt() operates on pcpu's priv->xstats.pcpu_stats they are different syncp. so how does lockdep splat happen.
Right, I do not see anything obvious yet.
Wild guess: I think it maybe saying that due to
inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
the critical code may somehow be interrupted and, while handling the interrupt, try to acquire the same lock again.
This should not happen, the 'syncp' are different. They have different lockdep classes.
One is exclusively used from hard irq context.
The second one only used from BH context.
Alexis Lothoré hit this now as well, see yesterday report in this thread; apart from that nothing seem to have happened for two weeks now. The change recently made it to some stable/longterm kernels, too. Makes me wonder:
What's the plan forward here? Is this considered to be a false positive?
Although my system has run stable for a couple of months, I am hesitant to call it a false positive.
Or a real problem?
That's what I think. But I would have to learn a lot about the network stack to understand what exactly happens here.
It may go faster if somebody else on the Cc can give me a hint where to start looking based on the lockdep warning.
Petr T
On Wed, 6 Mar 2024 09:23:53 +0100 "Linux regression tracking (Thorsten Leemhuis)" regressions@leemhuis.info wrote:
On 28.02.24 12:03, Petr Tesařík wrote:
On Wed, 28 Feb 2024 07:19:56 +0100 "Linux regression tracking (Thorsten Leemhuis)" regressions@leemhuis.info wrote:
Net maintainers, chiming in here, as it seems handling this regression stalled.
Indeed, I was too busy with sandbox mode...
Hmm, no reply in the past week to Petr's request for help from someone with more knowledge about the field. :-/
So I guess this means that this won't be fixed for 6.8? Unfortunate, but well, that's how it is sometimes.
For the record, I _can_ reproduce lockdep splats on my device, but they don't make any sense to me. They seem to confirm Jisheng Zhang's conclusion that lockdep conflates two locks which should have different lock-classes.
So far I have noticed only one issue: the per-cpu syncp's are not initialized. I'll recompile and see if that's what confuses lockdep.
Petr T
Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat)
Everything you wanna know about Linux kernel regression tracking: https://linux-regtracking.leemhuis.info/about/#tldr If I did something stupid, please tell me, as explained on that page.
#regzbot poke
On 13.02.24 16:52, Eric Dumazet wrote:
On Tue, Feb 13, 2024 at 4:26 PM Guenter Roeck linux@roeck-us.net wrote:
On Tue, Feb 13, 2024 at 03:51:35PM +0100, Eric Dumazet wrote:
On Tue, Feb 13, 2024 at 3:29 PM Jisheng Zhang jszhang@kernel.org wrote: > On Sun, Feb 11, 2024 at 08:30:21PM -0800, Guenter Roeck wrote: >> On Sat, Feb 03, 2024 at 08:09:27PM +0100, Petr Tesarik wrote: >>> As explained by a comment in <linux/u64_stats_sync.h>, write side of struct >>> u64_stats_sync must ensure mutual exclusion, or one seqcount update could >>> be lost on 32-bit platforms, thus blocking readers forever. Such lockups >>> have been observed in real world after stmmac_xmit() on one CPU raced with >>> stmmac_napi_poll_tx() on another CPU. >>> >>> To fix the issue without introducing a new lock, split the statics into >>> three parts: >>> >>> 1. fields updated only under the tx queue lock, >>> 2. fields updated only during NAPI poll, >>> 3. fields updated only from interrupt context, >>> >>> Updates to fields in the first two groups are already serialized through >>> other locks. It is sufficient to split the existing struct u64_stats_sync >>> so that each group has its own. >>> >>> Note that tx_set_ic_bit is updated from both contexts. Split this counter >>> so that each context gets its own, and calculate their sum to get the total >>> value in stmmac_get_ethtool_stats(). >>> >>> For the third group, multiple interrupts may be processed by different CPUs >>> at the same time, but interrupts on the same CPU will not nest. Move fields >>> from this group to a newly created per-cpu struct stmmac_pcpu_stats. >>> >>> Fixes: 133466c3bbe1 ("net: stmmac: use per-queue 64 bit statistics where necessary") >>> Link: https://lore.kernel.org/netdev/Za173PhviYg-1qIn@torres.zugschlus.de/t/ >>> Cc: stable@vger.kernel.org >>> Signed-off-by: Petr Tesarik petr@tesarici.cz >> >> This patch results in a lockdep splat. Backtrace and bisect results attached. 
>> >> --- >> [ 33.736728] ================================ >> [ 33.736805] WARNING: inconsistent lock state >> [ 33.736953] 6.8.0-rc4 #1 Tainted: G N >> [ 33.737080] -------------------------------- >> [ 33.737155] inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. >> [ 33.737309] kworker/0:2/39 [HC1[1]:SC0[2]:HE0:SE0] takes: >> [ 33.737459] ef792074 (&syncp->seq#2){?...}-{0:0}, at: sun8i_dwmac_dma_interrupt+0x9c/0x28c >> [ 33.738206] {HARDIRQ-ON-W} state was registered at: >> [ 33.738318] lock_acquire+0x11c/0x368 >> [ 33.738431] __u64_stats_update_begin+0x104/0x1ac >> [ 33.738525] stmmac_xmit+0x4d0/0xc58 > > interesting lockdep splat... > stmmac_xmit() operates on txq_stats->q_syncp, while the > sun8i_dwmac_dma_interrupt() operates on pcpu's priv->xstats.pcpu_stats > they are different syncp. so how does lockdep splat happen.
Right, I do not see anything obvious yet.
Wild guess: I think it maybe saying that due to
inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
the critical code may somehow be interrupted and, while handling the interrupt, try to acquire the same lock again.
This should not happen, the 'syncp' are different. They have different lockdep classes.
One is exclusively used from hard irq context.
The second one only used from BH context.
Alexis Lothoré hit this now as well, see yesterday's report in this thread; apart from that, nothing seems to have happened for two weeks now. The change recently made it to some stable/longterm kernels, too. Makes me wonder:
What's the plan forward here? Is this considered to be a false positive?
Although my system has run stable for a couple of months, I am hesitant to call it a false positive.
Or a real problem?
That's what I think. But I would have to learn a lot about the network stack to understand what exactly happens here.
It may go faster if somebody else on the Cc can give me a hint where to start looking based on the lockdep warning.
Petr T
On Wed, 6 Mar 2024 10:01:53 +0100 Petr Tesařík petr@tesarici.cz wrote:
On Wed, 6 Mar 2024 09:23:53 +0100 "Linux regression tracking (Thorsten Leemhuis)" regressions@leemhuis.info wrote:
On 28.02.24 12:03, Petr Tesařík wrote:
On Wed, 28 Feb 2024 07:19:56 +0100 "Linux regression tracking (Thorsten Leemhuis)" regressions@leemhuis.info wrote:
Net maintainers, chiming in here, as it seems handling this regression stalled.
Indeed, I was too busy with sandbox mode...
Hmm, no reply in the past week to Petr's request for help from someone with more knowledge about the field. :-/
So I guess this means that this won't be fixed for 6.8? Unfortunate, but well, that's how it is sometimes.
For the record, I _can_ reproduce lockdep splats on my device, but they don't make any sense to me. They seem to confirm Jisheng Zhang's conclusion that lockdep conflates two locks which should have different lock-classes.
So far I have noticed only one issue: the per-cpu syncp's are not initialized. I'll recompile and see if that's what confuses lockdep.
That wasn't the issue. FTR the syncp was in fact initialized, because devm_netdev_alloc_pcpu_stats() is a macro that also takes care of the initialization of the syncp struct field.
The problem is u64_stats_init().
Commit 9464ca650008 ("net: make u64_stats_init() a function") changed it to an inline function. But that's wrong. It uses seqcount_init(), which in turn declares:
static struct lock_class_key __key;
This assumes that each lock gets its own instance. But if u64_stats_init() is a function (albeit an inline one), all calls within the same file end up using the same instance.
Eric, would it be OK to revert the above-mentioned commit?
Petr T
On Wed, Mar 6, 2024 at 11:03 AM Petr Tesařík petr@tesarici.cz wrote:
On Wed, 6 Mar 2024 10:01:53 +0100 Petr Tesařík petr@tesarici.cz wrote:
On Wed, 6 Mar 2024 09:23:53 +0100 "Linux regression tracking (Thorsten Leemhuis)" regressions@leemhuis.info wrote:
On 28.02.24 12:03, Petr Tesařík wrote:
On Wed, 28 Feb 2024 07:19:56 +0100 "Linux regression tracking (Thorsten Leemhuis)" regressions@leemhuis.info wrote:
Net maintainers, chiming in here, as it seems handling this regression stalled.
Indeed, I was too busy with sandbox mode...
Hmm, no reply in the past week to Petr's request for help from someone with more knowledge about the field. :-/
So I guess this means that this won't be fixed for 6.8? Unfortunate, but well, that's how it it sometimes.
For the record, I _can_ reproduce lockdep splats on my device, but they don't make any sense to me. They seem to confirm Jisheng Zhang's conclusion that lockdep conflates two locks which should have different lock-classes.
So far I have noticed only one issue: the per-cpu syncp's are not initialized. I'll recompile and see if that's what confuses lockdep.
That wasn't the issue. FTR the syncp was in fact initialized, because devm_netdev_alloc_pcpu_stats() is a macro that also takes care of the initialization of the syncp struct field.
The problem is u64_stats_init().
Commit 9464ca650008 ("net: make u64_stats_init() a function") changed it to an inline function. But that's wrong. It uses seqcount_init(), which in turn declares:
static struct lock_class_key __key;
This assumes that each lock gets its own instance. But if u64_stats_init() is a function (albeit an inline one), all calls within the same file end up using the same instance.
Eric, would it be OK to revert the above-mentioned commit?
Oh, nice !
Well, this would not be a revert, let's keep type safety checks if possible.
Thanks.
Adding report to regression tracker.
#regzbot ^introduced 38cc3c6dcc09 #regzbot title Inconsistent lock state in stmmac code #regzbot ignore-activity
On Sun, Feb 11, 2024 at 08:30:21PM -0800, Guenter Roeck wrote:
Hi,
On Sat, Feb 03, 2024 at 08:09:27PM +0100, Petr Tesarik wrote:
As explained by a comment in <linux/u64_stats_sync.h>, write side of struct u64_stats_sync must ensure mutual exclusion, or one seqcount update could be lost on 32-bit platforms, thus blocking readers forever. Such lockups have been observed in real world after stmmac_xmit() on one CPU raced with stmmac_napi_poll_tx() on another CPU.
To fix the issue without introducing a new lock, split the statistics into three parts:
- fields updated only under the tx queue lock,
- fields updated only during NAPI poll,
- fields updated only from interrupt context,
Updates to fields in the first two groups are already serialized through other locks. It is sufficient to split the existing struct u64_stats_sync so that each group has its own.
Note that tx_set_ic_bit is updated from both contexts. Split this counter so that each context gets its own, and calculate their sum to get the total value in stmmac_get_ethtool_stats().
For the third group, multiple interrupts may be processed by different CPUs at the same time, but interrupts on the same CPU will not nest. Move fields from this group to a newly created per-cpu struct stmmac_pcpu_stats.
Fixes: 133466c3bbe1 ("net: stmmac: use per-queue 64 bit statistics where necessary") Link: https://lore.kernel.org/netdev/Za173PhviYg-1qIn@torres.zugschlus.de/t/ Cc: stable@vger.kernel.org Signed-off-by: Petr Tesarik petr@tesarici.cz
This patch results in a lockdep splat. Backtrace and bisect results attached.
Guenter
[ 33.736728] ================================ [ 33.736805] WARNING: inconsistent lock state [ 33.736953] 6.8.0-rc4 #1 Tainted: G N [ 33.737080] -------------------------------- [ 33.737155] inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. [ 33.737309] kworker/0:2/39 [HC1[1]:SC0[2]:HE0:SE0] takes: [ 33.737459] ef792074 (&syncp->seq#2){?...}-{0:0}, at: sun8i_dwmac_dma_interrupt+0x9c/0x28c [ 33.738206] {HARDIRQ-ON-W} state was registered at: [ 33.738318] lock_acquire+0x11c/0x368 [ 33.738431] __u64_stats_update_begin+0x104/0x1ac [ 33.738525] stmmac_xmit+0x4d0/0xc58 [ 33.738605] dev_hard_start_xmit+0xc4/0x2a0 [ 33.738689] sch_direct_xmit+0xf8/0x30c [ 33.738763] __dev_queue_xmit+0x400/0xcc4 [ 33.738831] ip6_finish_output2+0x254/0xafc [ 33.738903] mld_sendpack+0x260/0x5b0 [ 33.738969] mld_ifc_work+0x274/0x588 [ 33.739032] process_one_work+0x230/0x604 [ 33.739101] worker_thread+0x1dc/0x494 [ 33.739165] kthread+0x100/0x120 [ 33.739225] ret_from_fork+0x14/0x28 [ 33.739302] irq event stamp: 3553 [ 33.739371] hardirqs last enabled at (3552): [<c03e884c>] __call_rcu_common.constprop.0+0x1a4/0x6b4 [ 33.739515] hardirqs last disabled at (3553): [<c0300bd4>] __irq_svc+0x54/0xb8 [ 33.739638] softirqs last enabled at (3542): [<c1254a60>] neigh_resolve_output+0x1fc/0x254 [ 33.739795] softirqs last disabled at (3546): [<c1243798>] __dev_queue_xmit+0x48/0xcc4 [ 33.739919] [ 33.739919] other info that might help us debug this: [ 33.740021] Possible unsafe locking scenario: [ 33.740021] [ 33.740111] CPU0 [ 33.740158] ---- [ 33.740204] lock(&syncp->seq#2); [ 33.740314] <Interrupt> [ 33.740363] lock(&syncp->seq#2); [ 33.740511] [ 33.740511] *** DEADLOCK *** [ 33.740511] [ 33.740665] 8 locks held by kworker/0:2/39: [ 33.740761] #0: c4bfb2a8 ((wq_completion)mld){+.+.}-{0:0}, at: process_one_work+0x168/0x604 [ 33.741025] #1: f0909f20 ((work_completion)(&(&idev->mc_ifc_work)->work)){+.+.}-{0:0}, at: process_one_work+0x168/0x604 [ 33.741230] #2: c328baac (&idev->mc_lock){+.+.}-{3:3}, at: 
mld_ifc_work+0x24/0x588 [ 33.741387] #3: c2191488 (rcu_read_lock){....}-{1:2}, at: mld_sendpack+0x0/0x5b0 [ 33.741553] #4: c2191488 (rcu_read_lock){....}-{1:2}, at: ip6_finish_output2+0x174/0xafc [ 33.741716] #5: c219149c (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x48/0xcc4 [ 33.741877] #6: c4d3a974 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_queue_xmit+0x334/0xcc4 [ 33.742070] #7: c49e5050 (_xmit_ETHER#2){+...}-{2:2}, at: sch_direct_xmit+0x158/0x30c [ 33.742250] [ 33.742250] stack backtrace: [ 33.742426] CPU: 0 PID: 39 Comm: kworker/0:2 Tainted: G N 6.8.0-rc4 #1 [ 33.742578] Hardware name: Allwinner sun8i Family [ 33.742776] Workqueue: mld mld_ifc_work [ 33.742998] unwind_backtrace from show_stack+0x10/0x14 [ 33.743119] show_stack from dump_stack_lvl+0x68/0x90 [ 33.743232] dump_stack_lvl from mark_lock.part.0+0xbd8/0x12d8 [ 33.743345] mark_lock.part.0 from __lock_acquire+0xad4/0x224c [ 33.743458] __lock_acquire from lock_acquire+0x11c/0x368 [ 33.743564] lock_acquire from __u64_stats_update_begin+0x104/0x1ac [ 33.743683] __u64_stats_update_begin from sun8i_dwmac_dma_interrupt+0x9c/0x28c [ 33.743805] sun8i_dwmac_dma_interrupt from stmmac_napi_check+0x40/0x1c8 [ 33.743917] stmmac_napi_check from stmmac_interrupt+0xa4/0x154 [ 33.744020] stmmac_interrupt from __handle_irq_event_percpu+0xcc/0x2ec [ 33.744134] __handle_irq_event_percpu from handle_irq_event+0x38/0x80 [ 33.744243] handle_irq_event from handle_fasteoi_irq+0x9c/0x1c4 [ 33.744346] handle_fasteoi_irq from generic_handle_domain_irq+0x28/0x38 [ 33.744459] generic_handle_domain_irq from gic_handle_irq+0x98/0xcc [ 33.744567] gic_handle_irq from generic_handle_arch_irq+0x34/0x44 [ 33.744673] generic_handle_arch_irq from call_with_stack+0x18/0x20 [ 33.744831] call_with_stack from __irq_svc+0x9c/0xb8 [ 33.745018] Exception stack(0xf0909c00 to 0xf0909c48) [ 33.745221] 9c00: f0ab0000 c49e506c 0000005a 00000000 c0000006 f0ab0014 0000005a c0f5da68 [ 33.745387] 9c20: c35bd810 c4b50000 
c4b50000 c365d300 00000000 f0909c50 c0f70a70 c0f70a74 [ 33.745574] 9c40: 60000013 ffffffff [ 33.745668] __irq_svc from sun8i_dwmac_enable_dma_transmission+0x20/0x24 [ 33.745809] sun8i_dwmac_enable_dma_transmission from stmmac_xmit+0x790/0xc58 [ 33.745975] stmmac_xmit from dev_hard_start_xmit+0xc4/0x2a0 [ 33.746100] dev_hard_start_xmit from sch_direct_xmit+0xf8/0x30c [ 33.746220] sch_direct_xmit from __dev_queue_xmit+0x400/0xcc4 [ 33.746350] __dev_queue_xmit from ip6_finish_output2+0x254/0xafc [ 33.746462] ip6_finish_output2 from mld_sendpack+0x260/0x5b0 [ 33.746568] mld_sendpack from mld_ifc_work+0x274/0x588 [ 33.746670] mld_ifc_work from process_one_work+0x230/0x604 [ 33.746793] process_one_work from worker_thread+0x1dc/0x494 [ 33.746906] worker_thread from kthread+0x100/0x120 [ 33.746994] kthread from ret_from_fork+0x14/0x28 [ 33.747076] Exception stack(0xf0909fb0 to 0xf0909ff8) [ 33.747165] 9fa0: 00000000 00000000 00000000 00000000 [ 33.747303] 9fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 [ 33.747433] 9fe0: 00000000 00000000 00000000 00000000 00000013 00000000
# bad: [841c35169323cd833294798e58b9bf63fa4fa1de] Linux 6.8-rc4 # good: [54be6c6c5ae8e0d93a6c4641cb7528eb0b6ba478] Linux 6.8-rc3 git bisect start 'HEAD' 'v6.8-rc3' # bad: [c76b766ec50d3d43e2dacea53a733b285f4b730d] Merge tag 'drm-fixes-2024-02-09' of git://anongit.freedesktop.org/drm/drm git bisect bad c76b766ec50d3d43e2dacea53a733b285f4b730d # bad: [63e4b9d693e0f8c28359c7ea81e1ee510864c37b] Merge tag 'nf-24-02-08' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf git bisect bad 63e4b9d693e0f8c28359c7ea81e1ee510864c37b # bad: [75428f537d7cae33c7e4dd726144074f78622c09] net: intel: fix old compiler regressions git bisect bad 75428f537d7cae33c7e4dd726144074f78622c09 # good: [1a1c13303ff6d64e6f718dc8aa614e580ca8d9b4] nfp: flower: prevent re-adding mac index for bonded port git bisect good 1a1c13303ff6d64e6f718dc8aa614e580ca8d9b4 # good: [3871aa01e1a779d866fa9dfdd5a836f342f4eb87] tipc: Check the bearer type before calling tipc_udp_nl_bearer_add() git bisect good 3871aa01e1a779d866fa9dfdd5a836f342f4eb87 # good: [58086721b7781c3e35b19c9b78c8f5a791070ba3] devlink: avoid potential loop in devlink_rel_nested_in_notify_work() git bisect good 58086721b7781c3e35b19c9b78c8f5a791070ba3 # bad: [38cc3c6dcc09dc3a1800b5ec22aef643ca11eab8] net: stmmac: protect updates of 64-bit statistics counters git bisect bad 38cc3c6dcc09dc3a1800b5ec22aef643ca11eab8 # good: [cb88cb53badb8aeb3955ad6ce80b07b598e310b8] ppp_async: limit MRU to 64K git bisect good cb88cb53badb8aeb3955ad6ce80b07b598e310b8 # first bad commit: [38cc3c6dcc09dc3a1800b5ec22aef643ca11eab8] net: stmmac: protect updates of 64-bit statistics counters
linux-stable-mirror@lists.linaro.org