When discarding descriptors with IN_ORDER, we should rewind
next_avail_head, otherwise it falls out of sync with last_avail_idx and
the guest driver reports "id X is not a head".

Fix this by returning the number of descriptors consumed by each buffer
via vhost_get_vq_desc_n(), so callers can pass that count back when
discarding descriptors.
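For illustration (not part of the patch): a minimal sketch of why both
cursors must be rewound together. "struct cursors" and rollback() are
hypothetical names; only last_avail_idx and next_avail_head mirror the
real vhost_virtqueue fields.

	/*
	 * Minimal sketch (hypothetical types): last_avail_idx counts
	 * buffers taken from the available ring, next_avail_head counts
	 * descriptors. With IN_ORDER a buffer may span several chained
	 * descriptors, so the two advance at different rates. Rewinding
	 * only last_avail_idx leaves next_avail_head pointing into the
	 * middle of a chain, and the guest later complains
	 * "id X is not a head".
	 */
	struct cursors {
		unsigned int last_avail_idx;	/* buffers consumed */
		unsigned int next_avail_head;	/* descriptors consumed */
	};

	/* Roll back nbufs buffers that together consumed ndesc descriptors. */
	static void rollback(struct cursors *c, int nbufs, unsigned int ndesc)
	{
		c->next_avail_head -= ndesc;	/* the rewind this patch adds */
		c->last_avail_idx -= nbufs;
	}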
Fixes: 67a873df0c41 ("vhost: basic in order support")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
Changes since V1:
- Add function documentation
- Tweak the variable name
---
 drivers/vhost/net.c   | 53 ++++++++++++++++++------------
 drivers/vhost/vhost.c | 76 +++++++++++++++++++++++++++++++++++--------
 drivers/vhost/vhost.h | 10 +++++-
 3 files changed, 103 insertions(+), 36 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 35ded4330431..8f7f50acb6d6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -592,14 +592,15 @@ static void vhost_net_busy_poll(struct vhost_net *net,
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_net_virtqueue *tnvq,
 				    unsigned int *out_num, unsigned int *in_num,
-				    struct msghdr *msghdr, bool *busyloop_intr)
+				    struct msghdr *msghdr, bool *busyloop_intr,
+				    unsigned int *ndesc)
 {
 	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
 	struct vhost_virtqueue *rvq = &rnvq->vq;
 	struct vhost_virtqueue *tvq = &tnvq->vq;
 
-	int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
-				  out_num, in_num, NULL, NULL);
+	int r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
+				    out_num, in_num, NULL, NULL, ndesc);
 
 	if (r == tvq->num && tvq->busyloop_timeout) {
 		/* Flush batched packets first */
@@ -610,8 +611,8 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 
 		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
 
-		r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
-				      out_num, in_num, NULL, NULL);
+		r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
+					out_num, in_num, NULL, NULL, ndesc);
 	}
 
 	return r;
@@ -642,12 +643,14 @@ static int get_tx_bufs(struct vhost_net *net,
 		       struct vhost_net_virtqueue *nvq,
 		       struct msghdr *msg,
 		       unsigned int *out, unsigned int *in,
-		       size_t *len, bool *busyloop_intr)
+		       size_t *len, bool *busyloop_intr,
+		       unsigned int *ndesc)
 {
 	struct vhost_virtqueue *vq = &nvq->vq;
 	int ret;
 
-	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
+	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg,
+				       busyloop_intr, ndesc);
 
 	if (ret < 0 || ret == vq->num)
 		return ret;
@@ -766,6 +769,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 	int sent_pkts = 0;
 	bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
 	bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
+	unsigned int ndesc = 0;
 
 	do {
 		bool busyloop_intr = false;
@@ -774,7 +778,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 			vhost_tx_batch(net, nvq, sock, &msg);
 
 		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
-				   &busyloop_intr);
+				   &busyloop_intr, &ndesc);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(head < 0))
 			break;
@@ -806,7 +810,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 				goto done;
 			} else if (unlikely(err != -ENOSPC)) {
 				vhost_tx_batch(net, nvq, sock, &msg);
-				vhost_discard_vq_desc(vq, 1);
+				vhost_discard_vq_desc(vq, 1, ndesc);
 				vhost_net_enable_vq(net, vq);
 				break;
 			}
@@ -829,7 +833,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 		err = sock->ops->sendmsg(sock, &msg, len);
 		if (unlikely(err < 0)) {
 			if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) {
-				vhost_discard_vq_desc(vq, 1);
+				vhost_discard_vq_desc(vq, 1, ndesc);
 				vhost_net_enable_vq(net, vq);
 				break;
 			}
@@ -868,6 +872,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 	int err;
 	struct vhost_net_ubuf_ref *ubufs;
 	struct ubuf_info_msgzc *ubuf;
+	unsigned int ndesc = 0;
 	bool zcopy_used;
 	int sent_pkts = 0;
 
@@ -879,7 +884,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 
 		busyloop_intr = false;
 		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
-				   &busyloop_intr);
+				   &busyloop_intr, &ndesc);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(head < 0))
 			break;
@@ -941,7 +946,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 			vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
 		}
 		if (retry) {
-			vhost_discard_vq_desc(vq, 1);
+			vhost_discard_vq_desc(vq, 1, ndesc);
 			vhost_net_enable_vq(net, vq);
 			break;
 		}
@@ -1045,11 +1050,12 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
 		       unsigned *iovcount,
 		       struct vhost_log *log,
 		       unsigned *log_num,
-		       unsigned int quota)
+		       unsigned int quota,
+		       unsigned int *ndesc)
 {
 	struct vhost_virtqueue *vq = &nvq->vq;
 	bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
-	unsigned int out, in;
+	unsigned int out, in, desc_num, n = 0;
 	int seg = 0;
 	int headcount = 0;
 	unsigned d;
@@ -1064,9 +1070,9 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
 			r = -ENOBUFS;
 			goto err;
 		}
-		r = vhost_get_vq_desc(vq, vq->iov + seg,
-				      ARRAY_SIZE(vq->iov) - seg, &out,
-				      &in, log, log_num);
+		r = vhost_get_vq_desc_n(vq, vq->iov + seg,
+					ARRAY_SIZE(vq->iov) - seg, &out,
+					&in, log, log_num, &desc_num);
 		if (unlikely(r < 0))
 			goto err;
 
@@ -1093,6 +1099,7 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
 		++headcount;
 		datalen -= len;
 		seg += in;
+		n += desc_num;
 	}
 
 	*iovcount = seg;
@@ -1113,9 +1120,11 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
 		nheads[0] = headcount;
 	}
 
+	*ndesc = n;
+
 	return headcount;
 err:
-	vhost_discard_vq_desc(vq, headcount);
+	vhost_discard_vq_desc(vq, headcount, n);
 	return r;
 }
 
@@ -1151,6 +1160,7 @@ static void handle_rx(struct vhost_net *net)
 	struct iov_iter fixup;
 	__virtio16 num_buffers;
 	int recv_pkts = 0;
+	unsigned int ndesc;
 
 	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
 	sock = vhost_vq_get_backend(vq);
@@ -1182,7 +1192,8 @@ static void handle_rx(struct vhost_net *net)
 		headcount = get_rx_bufs(nvq, vq->heads + count,
 					vq->nheads + count,
 					vhost_len, &in, vq_log, &log,
-					likely(mergeable) ? UIO_MAXIOV : 1);
+					likely(mergeable) ? UIO_MAXIOV : 1,
+					&ndesc);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(headcount < 0))
 			goto out;
@@ -1228,7 +1239,7 @@ static void handle_rx(struct vhost_net *net)
 		if (unlikely(err != sock_len)) {
 			pr_debug("Discarded rx packet: "
 				 " len %d, expected %zd\n", err, sock_len);
-			vhost_discard_vq_desc(vq, headcount);
+			vhost_discard_vq_desc(vq, headcount, ndesc);
 			continue;
 		}
 		/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
@@ -1252,7 +1263,7 @@ static void handle_rx(struct vhost_net *net)
 		    copy_to_iter(&num_buffers, sizeof num_buffers, &fixup) !=
 		    sizeof num_buffers) {
 			vq_err(vq, "Failed num_buffers write");
-			vhost_discard_vq_desc(vq, headcount);
+			vhost_discard_vq_desc(vq, headcount, ndesc);
 			goto out;
 		}
 		nvq->done_idx += headcount;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8570fdf2e14a..a78226b37739 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2792,18 +2792,34 @@ static int get_indirect(struct vhost_virtqueue *vq,
 	return 0;
 }
 
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
+/**
+ * vhost_get_vq_desc_n - Fetch the next available descriptor chain and build iovecs
+ * @vq: target virtqueue
+ * @iov: array that receives the scatter/gather segments
+ * @iov_size: capacity of @iov in elements
+ * @out_num: the number of output segments
+ * @in_num: the number of input segments
+ * @log: optional array to record addr/len for each writable segment; NULL if unused
+ * @log_num: optional output; number of entries written to @log when provided
+ * @ndesc: optional output; number of descriptors consumed from the available ring
+ *         (useful for rollback via vhost_discard_vq_desc)
 *
- * This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found.  A negative code is
- * returned on error. */
-int vhost_get_vq_desc(struct vhost_virtqueue *vq,
-		      struct iovec iov[], unsigned int iov_size,
-		      unsigned int *out_num, unsigned int *in_num,
-		      struct vhost_log *log, unsigned int *log_num)
+ * Extracts one available descriptor chain from @vq and translates guest
+ * addresses into host iovecs.
+ *
+ * On success, advances @vq->last_avail_idx by 1 and @vq->next_avail_head by
+ * the number of descriptors consumed (also stored via @ndesc when non-NULL).
+ *
+ * Return:
+ * - head index in [0, @vq->num) on success;
+ * - @vq->num if no descriptor is currently available;
+ * - negative errno on failure
+ */
+int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
+			struct iovec iov[], unsigned int iov_size,
+			unsigned int *out_num, unsigned int *in_num,
+			struct vhost_log *log, unsigned int *log_num,
+			unsigned int *ndesc)
 {
 	bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
 	struct vring_desc desc;
@@ -2921,17 +2937,49 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 	vq->last_avail_idx++;
 	vq->next_avail_head += c;
 
+	if (ndesc)
+		*ndesc = c;
+
 	/* Assume notifications from guest are disabled at this point,
 	 * if they aren't we would need to update avail_event index. */
 	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
 	return head;
 }
+EXPORT_SYMBOL_GPL(vhost_get_vq_desc_n);
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access.  Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which is
+ * never a valid descriptor number) if none was found.  A negative code is
+ * returned on error. */
+int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+		      struct iovec iov[], unsigned int iov_size,
+		      unsigned int *out_num, unsigned int *in_num,
+		      struct vhost_log *log, unsigned int *log_num)
+{
+	return vhost_get_vq_desc_n(vq, iov, iov_size, out_num, in_num,
+				   log, log_num, NULL);
+}
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
-/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
-void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
+/**
+ * vhost_discard_vq_desc - Reverse the effect of vhost_get_vq_desc_n()
+ * @vq: target virtqueue
+ * @nbufs: number of buffers to roll back
+ * @ndesc: number of descriptors to roll back
+ *
+ * Rewinds the internal consumer cursors after a failed attempt to use
+ * buffers returned by vhost_get_vq_desc_n().
+ */
+void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int nbufs,
+			   unsigned int ndesc)
 {
-	vq->last_avail_idx -= n;
+	vq->next_avail_head -= ndesc;
+	vq->last_avail_idx -= nbufs;
 }
 EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
 
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 621a6d9a8791..b49f08e4a1b4 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -230,7 +230,15 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
 		      struct iovec iov[], unsigned int iov_size,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
-void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
+
+int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
+			struct iovec iov[], unsigned int iov_size,
+			unsigned int *out_num, unsigned int *in_num,
+			struct vhost_log *log, unsigned int *log_num,
+			unsigned int *ndesc);
+
+void vhost_discard_vq_desc(struct vhost_virtqueue *, int nbuf,
+			   unsigned int ndesc);
 
 bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work);
 bool vhost_vq_has_work(struct vhost_virtqueue *vq);
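For reference, the calling convention the patch establishes looks roughly
like this (a condensed sketch, not code from the patch; vq, out and in are
assumed to be in scope, and try_use_buffer() is a hypothetical stand-in
for the real consumer):

	unsigned int ndesc;
	int head = vhost_get_vq_desc_n(vq, vq->iov, ARRAY_SIZE(vq->iov),
				       &out, &in, NULL, NULL, &ndesc);

	if (head < 0 || head == vq->num)
		return head;	/* error, or nothing available */

	if (try_use_buffer(vq, head, out, in) < 0)
		/* undo one buffer and every descriptor it consumed */
		vhost_discard_vq_desc(vq, 1, ndesc);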
On Thu, 20 Nov 2025 10:29:50 +0800 Jason Wang wrote:
> Subject: [PATCH net V2] vhost: rewind next_avail_head while discarding
>  descriptors
>
>  drivers/vhost/net.c   | 53 ++++++++++++++++++------------
>  drivers/vhost/vhost.c | 76 +++++++++++++++++++++++++++++++++++--------
>  drivers/vhost/vhost.h | 10 +++++-
Hm, is this targeting net because Michael is not planning any more PRs for the 6.18 season?
On Wed, Nov 26, 2025 at 11:42 AM Jakub Kicinski <kuba@kernel.org> wrote:
> On Thu, 20 Nov 2025 10:29:50 +0800 Jason Wang wrote:
> > Subject: [PATCH net V2] vhost: rewind next_avail_head while discarding
> >  descriptors
> >
> >  drivers/vhost/net.c   | 53 ++++++++++++++++++------------
> >  drivers/vhost/vhost.c | 76 +++++++++++++++++++++++++++++++++++--------
> >  drivers/vhost/vhost.h | 10 +++++-
>
> Hm, is this targeting net because Michael is not planning any more PRs
> for the 6.18 season?
Basically because it touches vhost-net. I need input on which tree we
should use for this and for future modifications that touch both the
vhost core and vhost-net.
Thanks
On Thu, Nov 20, 2025 at 10:29:50AM +0800, Jason Wang wrote:
> When discarding descriptors with IN_ORDER, we should rewind
> next_avail_head, otherwise it falls out of sync with last_avail_idx and
> the guest driver reports "id X is not a head".
>
> Fix this by returning the number of descriptors consumed by each buffer
> via vhost_get_vq_desc_n(), so callers can pass that count back when
> discarding descriptors.
>
> Fixes: 67a873df0c41 ("vhost: basic in order support")
> Cc: stable@vger.kernel.org
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Acked-by: Michael S. Tsirkin <mst@redhat.com>
On Wed, Nov 26, 2025 at 02:18:25PM +0800, Jason Wang wrote:
> On Wed, Nov 26, 2025 at 11:42 AM Jakub Kicinski <kuba@kernel.org> wrote:
> > On Thu, 20 Nov 2025 10:29:50 +0800 Jason Wang wrote:
> > > Subject: [PATCH net V2] vhost: rewind next_avail_head while discarding
> > >  descriptors
> > >
> > >  drivers/vhost/net.c   | 53 ++++++++++++++++++------------
> > >  drivers/vhost/vhost.c | 76 +++++++++++++++++++++++++++++++++++--------
> > >  drivers/vhost/vhost.h | 10 +++++-
> >
> > Hm, is this targeting net because Michael is not planning any more PRs
> > for the 6.18 season?
>
> Basically because it touches vhost-net. I need input on which tree we
> should use for this and for future modifications that touch both the
> vhost core and vhost-net.
>
> Thanks
Well, this change is mostly net; the vhost changes are just moving code
around. The net tree gets more testing and more eyes looking at it, so
it's a good fit for such cases.
Hello:
This patch was applied to netdev/net.git (main) by Jakub Kicinski
<kuba@kernel.org>:
On Thu, 20 Nov 2025 10:29:50 +0800 you wrote:
> When discarding descriptors with IN_ORDER, we should rewind
> next_avail_head, otherwise it falls out of sync with last_avail_idx and
> the guest driver reports "id X is not a head".
>
> Fix this by returning the number of descriptors consumed by each buffer
> via vhost_get_vq_desc_n(), so callers can pass that count back when
> discarding descriptors.
>
> [...]
Here is the summary with links:
  - [net,V2] vhost: rewind next_avail_head while discarding descriptors
    https://git.kernel.org/netdev/net/c/779bcdd4b9ae
You are awesome, thank you!