On Mon, Sep 29, 2025 at 02:19:51PM +0300, Dmitry Bogdanov wrote:
nvme uses page_frag_cache to preallocate a PDU for each preallocated request of a block device. Block devices are created in parallel threads, so the page_frag_cache is used in a non-thread-safe manner. That leads to incorrect refcounting of the backing pages and to their premature free.

That can be caught by !sendpage_ok inside the network stack:
WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310
  tcp_sendmsg_locked+0x782/0xce0
  tcp_sendmsg+0x27/0x40
  sock_sendmsg+0x8b/0xa0
  nvme_tcp_try_send_cmd_pdu+0x149/0x2a0

Then a random panic may occur.
Fix that by serializing the usage of page_frag_cache.
Thank you for reporting this. I think we can fix it without blocking the async namespace scanning with a mutex, by switching from a per-queue page_frag_cache to a per-cpu one. There shouldn't be a need to keep the page_frag allocations isolated by queue anyway.
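Roughly the shape I have in mind is below. This is only an untested sketch, not the actual patch, and the helper name nvme_tcp_alloc_pdu() is just for illustration. The per-cpu lock is there only because the allocation is GFP_KERNEL and may sleep on a cache refill; scan threads on different CPUs never contend on it, so it doesn't serialize the scan the way a shared mutex would.

/*
 * Untested sketch: one PDU frag cache per possible CPU instead of one
 * per queue.  The mutex only guards against two tasks that happen to
 * pick the same CPU's cache; each lock would be mutex_init()ed at
 * module load and each cache drained with page_frag_cache_drain() at
 * unload (not shown here).
 */
struct nvme_tcp_pdu_cache {
	struct page_frag_cache	cache;
	struct mutex		lock;
};

static DEFINE_PER_CPU(struct nvme_tcp_pdu_cache, nvme_tcp_pdu_cache);

static void *nvme_tcp_alloc_pdu(size_t len)
{
	struct nvme_tcp_pdu_cache *c;
	void *pdu;

	/* Any CPU's cache works; this just spreads the allocations out. */
	c = per_cpu_ptr(&nvme_tcp_pdu_cache, raw_smp_processor_id());
	mutex_lock(&c->lock);
	pdu = page_frag_alloc(&c->cache, len, GFP_KERNEL | __GFP_ZERO);
	mutex_unlock(&c->lock);
	return pdu;
}

page_frag_free() doesn't care which cache a fragment came from, so the existing PDU free paths can stay as they are.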
It would be great if you could test the patch which I'll send after this.
- Chris
Cc: stable@vger.kernel.org # 6.12
Fixes: 4e893ca81170 ("nvme_core: scan namespaces asynchronously")
Signed-off-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
---
 drivers/nvme/host/tcp.c | 8 ++++++++
 1 file changed, 8 insertions(+)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 1413788ca7d52..823e07759e0d3 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -145,6 +145,7 @@ struct nvme_tcp_queue {
 	struct mutex		queue_lock;
 	struct mutex		send_mutex;
+	struct mutex		pf_cache_lock;
 	struct llist_head	req_list;
 	struct list_head	send_list;
@@ -556,9 +557,11 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
+	mutex_lock(&queue->pf_cache_lock);
 	req->pdu = page_frag_alloc(&queue->pf_cache,
 				sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 				GFP_KERNEL | __GFP_ZERO);
+	mutex_unlock(&queue->pf_cache_lock);
 	if (!req->pdu)
 		return -ENOMEM;
 
@@ -1420,9 +1423,11 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
 	struct nvme_tcp_request *async = &ctrl->async_req;
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
+	mutex_lock(&queue->pf_cache_lock);
 	async->pdu = page_frag_alloc(&queue->pf_cache,
 				sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 				GFP_KERNEL | __GFP_ZERO);
+	mutex_unlock(&queue->pf_cache_lock);
 	if (!async->pdu)
 		return -ENOMEM;
 
@@ -1450,6 +1455,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 	kfree(queue->pdu);
 	mutex_destroy(&queue->send_mutex);
 	mutex_destroy(&queue->queue_lock);
+	mutex_destroy(&queue->pf_cache_lock);
 }
 
 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
@@ -1772,6 +1778,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 	INIT_LIST_HEAD(&queue->send_list);
 	mutex_init(&queue->send_mutex);
 	INIT_WORK(&queue->io_work, nvme_tcp_io_work);
+	mutex_init(&queue->pf_cache_lock);
 
 	if (qid > 0)
 		queue->cmnd_capsule_len = nctrl->ioccsz * 16;
@@ -1903,6 +1910,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 err_destroy_mutex:
 	mutex_destroy(&queue->send_mutex);
 	mutex_destroy(&queue->queue_lock);
+	mutex_destroy(&queue->pf_cache_lock);
 	return ret;
 }
-- 
2.25.1