Each skb used as an element of the RX ring was expected to have a buffer of 8216 (RTK_PCI_RX_BUF_SIZE) bytes. However, the skb buffer's true size after allocation is 16640 bytes because of alignment, on x86_64 for example. And, the difference is multiplied 512 times (RTK_MAX_RX_DESC_NUM). To prevent that much wasted memory, this patch follows David's suggestion [1] and uses general buffer arrays, instead of skbs, as the elements in the RX ring.
[1] https://www.spinics.net/lists/linux-wireless/msg187870.html
Signed-off-by: Jian-Hong Pan jian-hong@endlessm.com Cc: stable@vger.kernel.org --- drivers/net/wireless/realtek/rtw88/pci.c | 132 +++++++++++++---------- drivers/net/wireless/realtek/rtw88/pci.h | 2 +- 2 files changed, 75 insertions(+), 59 deletions(-)
diff --git a/drivers/net/wireless/realtek/rtw88/pci.c b/drivers/net/wireless/realtek/rtw88/pci.c index 23dd06afef3d..e953010f0179 100644 --- a/drivers/net/wireless/realtek/rtw88/pci.c +++ b/drivers/net/wireless/realtek/rtw88/pci.c @@ -111,25 +111,49 @@ static void rtw_pci_free_tx_ring(struct rtw_dev *rtwdev, tx_ring->r.head = NULL; }
+static struct rtw_pci_rx_buffer_desc *rtw_pci_get_rx_desc( + struct rtw_pci_rx_ring *rx_ring, + u32 idx) +{ + struct rtw_pci_rx_buffer_desc *buf_desc; + u32 desc_sz = rx_ring->r.desc_size; + + buf_desc = (struct rtw_pci_rx_buffer_desc *)(rx_ring->r.head + + idx * desc_sz); + return buf_desc; +} + +static dma_addr_t rtw_pci_get_rx_bufdma(struct rtw_pci_rx_ring *rx_ring, + u32 idx) +{ + struct rtw_pci_rx_buffer_desc *buf_desc; + dma_addr_t dma; + + buf_desc = rtw_pci_get_rx_desc(rx_ring, idx); + dma = le32_to_cpu(buf_desc->dma); + + return dma; +} + static void rtw_pci_free_rx_ring(struct rtw_dev *rtwdev, struct rtw_pci_rx_ring *rx_ring) { struct pci_dev *pdev = to_pci_dev(rtwdev->dev); - struct sk_buff *skb; + u8 *buf; dma_addr_t dma; u8 *head = rx_ring->r.head; int buf_sz = RTK_PCI_RX_BUF_SIZE; int ring_sz = rx_ring->r.desc_size * rx_ring->r.len; - int i; + u32 i;
for (i = 0; i < rx_ring->r.len; i++) { - skb = rx_ring->buf[i]; - if (!skb) + buf = rx_ring->buf[i]; + if (!buf) continue;
- dma = *((dma_addr_t *)skb->cb); - pci_unmap_single(pdev, dma, buf_sz, PCI_DMA_FROMDEVICE); - dev_kfree_skb(skb); + dma = rtw_pci_get_rx_bufdma(rx_ring, i); + pci_unmap_single(pdev, dma, buf_sz, DMA_FROM_DEVICE); + devm_kfree(rtwdev->dev, buf); rx_ring->buf[i] = NULL; }
@@ -180,27 +204,24 @@ static int rtw_pci_init_tx_ring(struct rtw_dev *rtwdev, return 0; }
-static int rtw_pci_reset_rx_desc(struct rtw_dev *rtwdev, struct sk_buff *skb, - struct rtw_pci_rx_ring *rx_ring, - u32 idx, u32 desc_sz) +static int rtw_pci_reset_rx_desc(struct rtw_dev *rtwdev, u8 *buf, + struct rtw_pci_rx_ring *rx_ring, u32 idx) { struct pci_dev *pdev = to_pci_dev(rtwdev->dev); struct rtw_pci_rx_buffer_desc *buf_desc; int buf_sz = RTK_PCI_RX_BUF_SIZE; dma_addr_t dma;
- if (!skb) + if (!buf) return -EINVAL;
- dma = pci_map_single(pdev, skb->data, buf_sz, PCI_DMA_FROMDEVICE); + dma = pci_map_single(pdev, buf, buf_sz, DMA_FROM_DEVICE); if (pci_dma_mapping_error(pdev, dma)) return -EBUSY;
- *((dma_addr_t *)skb->cb) = dma; - buf_desc = (struct rtw_pci_rx_buffer_desc *)(rx_ring->r.head + - idx * desc_sz); - memset(buf_desc, 0, sizeof(*buf_desc)); + buf_desc = rtw_pci_get_rx_desc(rx_ring, idx); buf_desc->buf_size = cpu_to_le16(RTK_PCI_RX_BUF_SIZE); + buf_desc->total_pkt_size = cpu_to_le16(0); buf_desc->dma = cpu_to_le32(dma);
return 0; @@ -208,7 +229,7 @@ static int rtw_pci_reset_rx_desc(struct rtw_dev *rtwdev, struct sk_buff *skb,
static void rtw_pci_sync_rx_desc_device(struct rtw_dev *rtwdev, dma_addr_t dma, struct rtw_pci_rx_ring *rx_ring, - u32 idx, u32 desc_sz) + u32 idx) { struct device *dev = rtwdev->dev; struct rtw_pci_rx_buffer_desc *buf_desc; @@ -216,10 +237,9 @@ static void rtw_pci_sync_rx_desc_device(struct rtw_dev *rtwdev, dma_addr_t dma,
dma_sync_single_for_device(dev, dma, buf_sz, DMA_FROM_DEVICE);
- buf_desc = (struct rtw_pci_rx_buffer_desc *)(rx_ring->r.head + - idx * desc_sz); - memset(buf_desc, 0, sizeof(*buf_desc)); + buf_desc = rtw_pci_get_rx_desc(rx_ring, idx); buf_desc->buf_size = cpu_to_le16(RTK_PCI_RX_BUF_SIZE); + buf_desc->total_pkt_size = cpu_to_le16(0); buf_desc->dma = cpu_to_le32(dma); }
@@ -228,12 +248,12 @@ static int rtw_pci_init_rx_ring(struct rtw_dev *rtwdev, u8 desc_size, u32 len) { struct pci_dev *pdev = to_pci_dev(rtwdev->dev); - struct sk_buff *skb = NULL; + u8 *buf = NULL; dma_addr_t dma; u8 *head; int ring_sz = desc_size * len; int buf_sz = RTK_PCI_RX_BUF_SIZE; - int i, allocated; + u32 i, allocated; int ret = 0;
head = pci_zalloc_consistent(pdev, ring_sz, &dma); @@ -242,41 +262,39 @@ static int rtw_pci_init_rx_ring(struct rtw_dev *rtwdev, return -ENOMEM; } rx_ring->r.head = head; + rx_ring->r.dma = dma; + rx_ring->r.len = len; + rx_ring->r.desc_size = desc_size; + rx_ring->r.wp = 0; + rx_ring->r.rp = 0;
for (i = 0; i < len; i++) { - skb = dev_alloc_skb(buf_sz); - if (!skb) { + buf = devm_kzalloc(rtwdev->dev, buf_sz, GFP_ATOMIC); + if (!buf) { allocated = i; ret = -ENOMEM; goto err_out; }
- memset(skb->data, 0, buf_sz); - rx_ring->buf[i] = skb; - ret = rtw_pci_reset_rx_desc(rtwdev, skb, rx_ring, i, desc_size); + rx_ring->buf[i] = buf; + ret = rtw_pci_reset_rx_desc(rtwdev, buf, rx_ring, i); if (ret) { allocated = i; - dev_kfree_skb_any(skb); + devm_kfree(rtwdev->dev, buf); goto err_out; } }
- rx_ring->r.dma = dma; - rx_ring->r.len = len; - rx_ring->r.desc_size = desc_size; - rx_ring->r.wp = 0; - rx_ring->r.rp = 0; - return 0;
err_out: for (i = 0; i < allocated; i++) { - skb = rx_ring->buf[i]; - if (!skb) + buf = rx_ring->buf[i]; + if (!buf) continue; - dma = *((dma_addr_t *)skb->cb); - pci_unmap_single(pdev, dma, buf_sz, PCI_DMA_FROMDEVICE); - dev_kfree_skb_any(skb); + dma = rtw_pci_get_rx_bufdma(rx_ring, i); + pci_unmap_single(pdev, dma, buf_sz, DMA_FROM_DEVICE); + devm_kfree(rtwdev->dev, buf); rx_ring->buf[i] = NULL; } pci_free_consistent(pdev, ring_sz, head, dma); @@ -776,13 +794,12 @@ static void rtw_pci_rx_isr(struct rtw_dev *rtwdev, struct rtw_pci *rtwpci, struct rtw_pci_rx_ring *ring; struct rtw_rx_pkt_stat pkt_stat; struct ieee80211_rx_status rx_status; - struct sk_buff *skb, *new; + struct sk_buff *skb; u32 cur_wp, cur_rp, tmp; u32 count; u32 pkt_offset; u32 pkt_desc_sz = chip->rx_pkt_desc_sz; - u32 buf_desc_sz = chip->rx_buf_desc_sz; - u32 new_len; + u32 len; u8 *rx_desc; dma_addr_t dma;
@@ -799,11 +816,11 @@ static void rtw_pci_rx_isr(struct rtw_dev *rtwdev, struct rtw_pci *rtwpci, cur_rp = ring->r.rp; while (count--) { rtw_pci_dma_check(rtwdev, ring, cur_rp); - skb = ring->buf[cur_rp]; - dma = *((dma_addr_t *)skb->cb); + /* buffer is already filled as rx_desc */ + rx_desc = ring->buf[cur_rp]; + dma = rtw_pci_get_rx_bufdma(ring, cur_rp); dma_sync_single_for_cpu(rtwdev->dev, dma, RTK_PCI_RX_BUF_SIZE, DMA_FROM_DEVICE); - rx_desc = skb->data; chip->ops->query_rx_desc(rtwdev, rx_desc, &pkt_stat, &rx_status);
/* offset from rx_desc to payload */ @@ -813,32 +830,31 @@ static void rtw_pci_rx_isr(struct rtw_dev *rtwdev, struct rtw_pci *rtwpci, /* allocate a new skb for this frame, * discard the frame if none available */ - new_len = pkt_stat.pkt_len + pkt_offset; - new = dev_alloc_skb(new_len); - if (WARN_ONCE(!new, "rx routine starvation\n")) + len = pkt_stat.pkt_len + pkt_offset; + skb = dev_alloc_skb(len); + if (WARN_ONCE(!skb, "rx routine starvation\n")) goto next_rp;
/* put the DMA data including rx_desc from phy to new skb */ - skb_put_data(new, skb->data, new_len); + skb_put_data(skb, rx_desc, len);
if (pkt_stat.is_c2h) { /* pass rx_desc & offset for further operation */ - *((u32 *)new->cb) = pkt_offset; - skb_queue_tail(&rtwdev->c2h_queue, new); + *((u32 *)skb->cb) = pkt_offset; + skb_queue_tail(&rtwdev->c2h_queue, skb); ieee80211_queue_work(rtwdev->hw, &rtwdev->c2h_work); } else { /* remove rx_desc */ - skb_pull(new, pkt_offset); + skb_pull(skb, pkt_offset);
- rtw_rx_stats(rtwdev, pkt_stat.vif, new); - memcpy(new->cb, &rx_status, sizeof(rx_status)); - ieee80211_rx_irqsafe(rtwdev->hw, new); + rtw_rx_stats(rtwdev, pkt_stat.vif, skb); + memcpy(skb->cb, &rx_status, sizeof(rx_status)); + ieee80211_rx_irqsafe(rtwdev->hw, skb); }
next_rp: - /* new skb delivered to mac80211, re-enable original skb DMA */ - rtw_pci_sync_rx_desc_device(rtwdev, dma, ring, cur_rp, - buf_desc_sz); + /* new skb delivered to mac80211, re-enable original buf DMA */ + rtw_pci_sync_rx_desc_device(rtwdev, dma, ring, cur_rp);
/* host read next element in ring */ if (++cur_rp >= ring->r.len) diff --git a/drivers/net/wireless/realtek/rtw88/pci.h b/drivers/net/wireless/realtek/rtw88/pci.h index 87824a4caba9..283685421a64 100644 --- a/drivers/net/wireless/realtek/rtw88/pci.h +++ b/drivers/net/wireless/realtek/rtw88/pci.h @@ -174,7 +174,7 @@ struct rtw_pci_rx_buffer_desc {
struct rtw_pci_rx_ring { struct rtw_pci_ring r; - struct sk_buff *buf[RTK_MAX_RX_DESC_NUM]; + u8 *buf[RTK_MAX_RX_DESC_NUM]; };
#define RX_TAG_MAX 8192
From: Jian-Hong Pan
Sent: 25 July 2019 09:09 Each skb as the element in RX ring was expected with sized buffer 8216 (RTK_PCI_RX_BUF_SIZE) bytes. However, the skb buffer's true size is 16640 bytes for alignment after allocated, x86_64 for example. And, the difference will be enlarged 512 times (RTK_MAX_RX_DESC_NUM). To prevent that much wasted memory, this patch follows David's suggestion [1] and uses general buffer arrays, instead of skbs as the elements in RX ring.
...
for (i = 0; i < len; i++) {
skb = dev_alloc_skb(buf_sz);
if (!skb) {
buf = devm_kzalloc(rtwdev->dev, buf_sz, GFP_ATOMIC);
You should do this allocation somewhere that can sleep. So you don't need GFP_ATOMIC, making the allocation (and dma map) much less likely to fail. If they do fail, using a smaller ring might be better than failing completely.
I suspect that buf_sz gets rounded up somewhat. Also you almost certainly want 'buf' to be cache-line aligned. I don't think devm_kzalloc() guarantees that at all.
While allocating all 512 buffers in one block (just over 4MB) is probably not a good idea, you may need to allocate (and dma map) them in groups.
David
- Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
David Laight David.Laight@aculab.com 於 2019年7月25日 週四 下午5:21寫道:
From: Jian-Hong Pan
Sent: 25 July 2019 09:09 Each skb as the element in RX ring was expected with sized buffer 8216 (RTK_PCI_RX_BUF_SIZE) bytes. However, the skb buffer's true size is 16640 bytes for alignment after allocated, x86_64 for example. And, the difference will be enlarged 512 times (RTK_MAX_RX_DESC_NUM). To prevent that much wasted memory, this patch follows David's suggestion [1] and uses general buffer arrays, instead of skbs as the elements in RX ring.
...
for (i = 0; i < len; i++) {
skb = dev_alloc_skb(buf_sz);
if (!skb) {
buf = devm_kzalloc(rtwdev->dev, buf_sz, GFP_ATOMIC);
You should do this allocation somewhere that can sleep. So you don't need GFP_ATOMIC, making the allocation (and dma map) much less likely to fail. If they do fail, using a smaller ring might be better than failing completely.
Ok, I can tweak and try it.
I suspect that buf_sz gets rounded up somewhat. Also you almost certainly want 'buf' to be cache-line aligned. I don't think devm_kzalloc() guarantees that at all.
Sure
While allocating all 512 buffers in one block (just over 4MB) is probably not a good idea, you may need to allocate (and dma map) them in groups.
Thanks for reviewing. But got questions here to double confirm the idea. According to original code, it allocates 512 skbs for RX ring and dma mapping one by one. So, the new code allocates memory buffer 512 times to get 512 buffer arrays. Will the 512 buffers arrays be in one block? Do you mean aggregate the buffers as a scatterlist and use dma_map_sg?
Thank you, Jian-Hong Pan
From: Jian-Hong Pan
Sent: 26 July 2019 07:18
...
While allocating all 512 buffers in one block (just over 4MB) is probably not a good idea, you may need to allocate (and dma map) them in groups.
Thanks for reviewing. But got questions here to double confirm the idea. According to original code, it allocates 512 skbs for RX ring and dma mapping one by one. So, the new code allocates memory buffer 512 times to get 512 buffer arrays. Will the 512 buffers arrays be in one block? Do you mean aggregate the buffers as a scatterlist and use dma_map_sg?
If you malloc a buffer of size (8192+32) the allocator will either round it up to a whole number of (often 4k) pages or to a power of 2 of pages - so either 12k or 16k. I think the Linux allocator does the latter. Some of the allocators also 'steal' a bit from the front of the buffer for 'red tape'.
OTOH malloc the space for 15 buffers and the allocator will round the 15*(8192 + 32) up to 32*4k - and you waste under 8k across all the buffers.
You then dma_map the large buffer and split into the actual rx buffers. Repeat until you've filled the entire ring. The only complication is remembering the base address (and size) for the dma_unmap and free. Although there is plenty of padding to extend the buffer structure significantly without using more memory. Allocate in 15's and you (probably) have 512 bytes per buffer. Allocate in 31's and you have 256 bytes.
The problem is that larger allocates are more likely to fail (especially if the system has been running for some time). So you almost certainly want to be able to fall back to smaller allocates even though they use more memory.
I also wonder if you actually need 512 8k rx buffers to cover interrupt latency? I've not done any measurements for 20 years!
David
- Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
David Laight David.Laight@aculab.com 於 2019年7月26日 週五 下午5:23寫道:
From: Jian-Hong Pan
Sent: 26 July 2019 07:18
...
While allocating all 512 buffers in one block (just over 4MB) is probably not a good idea, you may need to allocate (and dma map) them in groups.
Thanks for reviewing. But got questions here to double confirm the idea. According to original code, it allocates 512 skbs for RX ring and dma mapping one by one. So, the new code allocates memory buffer 512 times to get 512 buffer arrays. Will the 512 buffers arrays be in one block? Do you mean aggregate the buffers as a scatterlist and use dma_map_sg?
If you malloc a buffer of size (8192+32) the allocator will either round it up to a whole number of (often 4k) pages or to a power of 2 of pages - so either 12k or 16k. I think the Linux allocator does the latter. Some of the allocators also 'steal' a bit from the front of the buffer for 'red tape'.
OTOH malloc the space for 15 buffers and the allocator will round the 15*(8192 + 32) up to 32*4k - and you waste under 8k across all the buffers.
You then dma_map the large buffer and split into the actual rx buffers. Repeat until you've filled the entire ring. The only complication is remembering the base address (and size) for the dma_unmap and free. Although there is plenty of padding to extend the buffer structure significantly without using more memory. Allocate in 15's and you (probably) have 512 bytes per buffer. Allocate in 31's and you have 256 bytes.
The problem is that larger allocates are more likely to fail (especially if the system has been running for some time). So you almost certainly want to be able to fall back to smaller allocates even though they use more memory.
I also wonder if you actually need 512 8k rx buffers to cover interrupt latency? I've not done any measurements for 20 years!
Thanks for the explanation. I am not sure about the combination of 512 8k RX buffers. Maybe Realtek folks can give us some idea. Tony Chuang, any comment?
Jian-Hong Pan
While allocating all 512 buffers in one block (just over 4MB) is probably not a good idea, you may need to allocate (and dma map) them in groups.
Thanks for reviewing. But got questions here to double confirm the
idea.
According to original code, it allocates 512 skbs for RX ring and dma mapping one by one. So, the new code allocates memory buffer 512 times to get 512 buffer arrays. Will the 512 buffers arrays be in one block? Do you mean aggregate the buffers as a scatterlist and use dma_map_sg?
If you malloc a buffer of size (8192+32) the allocator will either round it up to a whole number of (often 4k) pages or to a power of 2 of pages - so either 12k or 16k. I think the Linux allocator does the latter. Some of the allocators also 'steal' a bit from the front of the buffer for 'red tape'.
OTOH malloc the space for 15 buffers and the allocator will round the 15*(8192 + 32) up to 32*4k - and you waste under 8k across all the buffers.
You then dma_map the large buffer and split into the actual rx buffers. Repeat until you've filled the entire ring. The only complication is remembering the base address (and size) for the dma_unmap and free. Although there is plenty of padding to extend the buffer structure significantly without using more memory. Allocate in 15's and you (probably) have 512 bytes per buffer. Allocate in 31's and you have 256 bytes.
The problem is that larger allocates are more likely to fail (especially if the system has been running for some time). So you almost certainly want to be able to fall back to smaller allocates even though they use more memory.
I also wonder if you actually need 512 8k rx buffers to cover interrupt latency? I've not done any measurements for 20 years!
Thanks for the explanation. I am not sure about the combination of 512 8k RX buffers. Maybe Realtek folks can give us some idea. Tony Chuang, any comment?
Jian-Hong Pan
512 RX buffers is not necessary I think. But I haven't had a chance to test whether reducing the number of RX SKBs could affect the latency. I can run some throughput tests and then decide the minimum number that the RX ring requires. Or you can try it.
Thanks. Yan-Hsuan
On Thu, Jul 25, 2019 at 04:09:26PM +0800, Jian-Hong Pan wrote:
Each skb as the element in RX ring was expected with sized buffer 8216 (RTK_PCI_RX_BUF_SIZE) bytes. However, the skb buffer's true size is 16640 bytes for alignment after allocated, x86_64 for example. And, the
rtw88 advertises IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454, so the maximum AMSDU packet can be approximately 12kB. This might be accidental, but having 16kB skbs allows handling such big AMSDUs. If you shrink the buf size, you can probably overwrite memory after the buffer end.
difference will be enlarged 512 times (RTK_MAX_RX_DESC_NUM). To prevent that much wasted memory, this patch follows David's suggestion [1] and uses general buffer arrays, instead of skbs as the elements in RX ring.
[1] https://www.spinics.net/lists/linux-wireless/msg187870.html
Signed-off-by: Jian-Hong Pan jian-hong@endlessm.com Cc: stable@vger.kernel.org
This does not fix any serious problem; it actually most likely introduces the memory corruption problem described above. It should not be targeted to stable anyway.
dev_kfree_skb_any(skb);
devm_kfree(rtwdev->dev, buf);
What is this needed for? devm_ allocations are used exactly to avoid manual freeing.
len = pkt_stat.pkt_len + pkt_offset;
skb = dev_alloc_skb(len);
if (WARN_ONCE(!skb, "rx routine starvation\n")) goto next_rp;
/* put the DMA data including rx_desc from phy to new skb */
skb_put_data(new, skb->data, new_len);
skb_put_data(skb, rx_desc, len);
Copying big packets is quite inefficient. What drivers usually do is copy only for small packets and for big ones allocate a new rx buf (drop the packet if the allocation fails) and pass the old buf to the network stack via skb_add_rx_frag(). See iwlmvm as an example.
Stanislaw
From: Stanislaw Gruszka
Sent: 30 July 2019 10:36
...
len = pkt_stat.pkt_len + pkt_offset;
skb = dev_alloc_skb(len);
if (WARN_ONCE(!skb, "rx routine starvation\n")) goto next_rp;
/* put the DMA data including rx_desc from phy to new skb */
skb_put_data(new, skb->data, new_len);
skb_put_data(skb, rx_desc, len);
Copying big packets is quite inefficient. What drivers usually do is copy only for small packets and for big ones allocate a new rx buf (drop the packet if the allocation fails) and pass the old buf to the network stack via skb_add_rx_frag(). See iwlmvm as an example.
If you have to do iommu setup/teardown then the breakeven point for (not) copying may be surprisingly large. You do need to do the measurements on a range of hardware. Copying is also likely to affect the L1 cache - unless you can copy quickly without polluting the cache.
It is all 'swings and roundabouts'.
David
- Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
linux-stable-mirror@lists.linaro.org