vhost_get_vq_desc (correctly) uses smp_rmb to order avail ring reads after index reads. However, over time we added two more places that read the index and do not bother with barriers. Since vhost_get_vq_desc when it was written assumed it is the only reader when it sees a new index value is cached it does not bother with a barrier either, as a result, on the nvidia-gracehopper platform (arm64) available ring entry reads have been observed bypassing ring reads, causing a ring corruption.
To fix, factor out the correct index access code from vhost_get_vq_desc. As a side benefit, we also validate the index on all paths now, which will hopefully help catch future errors earlier.
Note: current code is inconsistent in how it handles errors: some places treat it as an empty ring, others - non empty. This patch does not attempt to change the existing behaviour.
Cc: stable@vger.kernel.org Reported-by: Gavin Shan gshan@redhat.com Reported-by: Will Deacon will@kernel.org Suggested-by: Will Deacon will@kernel.org Fixes: 275bf960ac69 ("vhost: better detection of available buffers") Cc: "Jason Wang" jasowang@redhat.com Fixes: d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()") Cc: "Stefano Garzarella" sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin mst@redhat.com ---
I think it's better to bite the bullet and clean up the code. Note: this is still only built, not tested. Gavin could you help test please? Especially on the arm platform you have?
Will thanks so much for finding this race!
drivers/vhost/vhost.c | 80 +++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 38 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..26b70b1fd9ff 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1290,10 +1290,38 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d) mutex_unlock(&d->vqs[i]->mutex); }
-static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, - __virtio16 *idx) +static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) { - return vhost_get_avail(vq, *idx, &vq->avail->idx); + __virtio16 idx; + u16 avail_idx; + int r = vhost_get_avail(vq, idx, &vq->avail->idx); + + if (unlikely(r < 0)) { + vq_err(vq, "Failed to access avail idx at %p: %d\n", + &vq->avail->idx, r); + return -EFAULT; + } + + avail_idx = vhost16_to_cpu(vq, idx); + + /* Check it isn't doing very strange things with descriptor numbers. */ + if (unlikely((u16)(avail_idx - vq->last_avail_idx) > vq->num)) { + vq_err(vq, "Guest moved used index from %u to %u", + vq->last_avail_idx, vq->avail_idx); + return -EFAULT; + } + + /* Nothing new? We are done. */ + if (avail_idx == vq->avail_idx) + return 0; + + vq->avail_idx = avail_idx; + + /* We updated vq->avail_idx so we need a memory barrier between + * the index read above and the caller reading avail ring entries. + */ + smp_rmb(); + return 1; }
static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, @@ -2498,38 +2526,21 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - u16 last_avail_idx; - __virtio16 avail_idx; + u16 last_avail_idx = vq->last_avail_idx; __virtio16 ring_head; int ret, access;
- /* Check it isn't doing very strange things with descriptor numbers. */ - last_avail_idx = vq->last_avail_idx;
if (vq->avail_idx == vq->last_avail_idx) { - if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) { - vq_err(vq, "Failed to access avail idx at %p\n", - &vq->avail->idx); - return -EFAULT; - } - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - - if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { - vq_err(vq, "Guest moved used index from %u to %u", - last_avail_idx, vq->avail_idx); - return -EFAULT; - } + ret = vhost_get_avail_idx(vq); + if (unlikely(ret < 0)) + return ret;
/* If there's nothing new since last we looked, return * invalid. */ - if (vq->avail_idx == last_avail_idx) + if (!ret) return vq->num; - - /* Only get avail ring entries after they have been - * exposed by guest. - */ - smp_rmb(); }
/* Grab the next descriptor number they're advertising, and increment @@ -2790,25 +2801,21 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); /* return true if we're sure that avaiable ring is empty */ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; int r;
if (vq->avail_idx != vq->last_avail_idx) return false;
- r = vhost_get_avail_idx(vq, &avail_idx); - if (unlikely(r)) - return false; - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + r = vhost_get_avail_idx(vq);
- return vq->avail_idx == vq->last_avail_idx; + /* Note: we treat error as non-empty here */ + return r == 0; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
/* OK, now we need to know about added descriptors. */ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; int r;
if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) @@ -2832,13 +2839,10 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) /* They could have slipped one in as we were doing that: make * sure it's written, then check again. */ smp_mb(); - r = vhost_get_avail_idx(vq, &avail_idx); - if (r) { - vq_err(vq, "Failed to check avail idx at %p: %d\n", - &vq->avail->idx, r); + r = vhost_get_avail_idx(vq); + /* Note: we treat error as empty here */ + if (r < 0) return false; - } - vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
return vq->avail_idx != vq->last_avail_idx; }
On Wed, Mar 27, 2024 at 01:26:23PM -0400, Michael S. Tsirkin wrote:
vhost_get_vq_desc (correctly) uses smp_rmb to order avail ring reads after index reads. However, over time we added two more places that read the index and do not bother with barriers. Since vhost_get_vq_desc when it was written assumed it is the only reader when it sees a new index value is cached it does not bother with a barrier either, as a result, on the nvidia-gracehopper platform (arm64) available ring entry reads have been observed bypassing ring reads, causing a ring corruption.
To fix, factor out the correct index access code from vhost_get_vq_desc. As a side benefit, we also validate the index on all paths now, which will hopefully help catch future errors earlier.
Note: current code is inconsistent in how it handles errors: some places treat it as an empty ring, others - non empty. This patch does not attempt to change the existing behaviour.
Cc: stable@vger.kernel.org Reported-by: Gavin Shan gshan@redhat.com Reported-by: Will Deacon will@kernel.org Suggested-by: Will Deacon will@kernel.org Fixes: 275bf960ac69 ("vhost: better detection of available buffers") Cc: "Jason Wang" jasowang@redhat.com Fixes: d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()") Cc: "Stefano Garzarella" sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin mst@redhat.com
I think it's better to bite the bullet and clean up the code. Note: this is still only built, not tested. Gavin could you help test please? Especially on the arm platform you have?
Will thanks so much for finding this race!
No problem, and I was also hoping that the smp_rmb() could be consolidated into a single helper like you've done here.
One minor comment below:
drivers/vhost/vhost.c | 80 +++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 38 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..26b70b1fd9ff 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1290,10 +1290,38 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d) mutex_unlock(&d->vqs[i]->mutex); } -static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
__virtio16 *idx)
+static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) {
- return vhost_get_avail(vq, *idx, &vq->avail->idx);
- __virtio16 idx;
- u16 avail_idx;
- int r = vhost_get_avail(vq, idx, &vq->avail->idx);
- if (unlikely(r < 0)) {
vq_err(vq, "Failed to access avail idx at %p: %d\n",
&vq->avail->idx, r);
return -EFAULT;
- }
- avail_idx = vhost16_to_cpu(vq, idx);
- /* Check it isn't doing very strange things with descriptor numbers. */
- if (unlikely((u16)(avail_idx - vq->last_avail_idx) > vq->num)) {
vq_err(vq, "Guest moved used index from %u to %u",
vq->last_avail_idx, vq->avail_idx);
return -EFAULT;
- }
- /* Nothing new? We are done. */
- if (avail_idx == vq->avail_idx)
return 0;
- vq->avail_idx = avail_idx;
- /* We updated vq->avail_idx so we need a memory barrier between
* the index read above and the caller reading avail ring entries.
*/
- smp_rmb();
I think you could use smp_acquire__after_ctrl_dep() if you're feeling brave, but to be honest I'd prefer we went in the opposite direction and used READ/WRITE_ONCE + smp_load_acquire()/smp_store_release() across the board. It's just a thankless, error-prone task to get there :(
So, for the patch as-is:
Acked-by: Will Deacon will@kernel.org
(I've not tested it either though, so definitely wait for Gavin on that!)
Cheers,
Will
On Wed, Mar 27, 2024 at 07:52:02PM +0000, Will Deacon wrote:
On Wed, Mar 27, 2024 at 01:26:23PM -0400, Michael S. Tsirkin wrote:
vhost_get_vq_desc (correctly) uses smp_rmb to order avail ring reads after index reads. However, over time we added two more places that read the index and do not bother with barriers. Since vhost_get_vq_desc when it was written assumed it is the only reader when it sees a new index value is cached it does not bother with a barrier either, as a result, on the nvidia-gracehopper platform (arm64) available ring entry reads have been observed bypassing ring reads, causing a ring corruption.
To fix, factor out the correct index access code from vhost_get_vq_desc. As a side benefit, we also validate the index on all paths now, which will hopefully help catch future errors earlier.
Note: current code is inconsistent in how it handles errors: some places treat it as an empty ring, others - non empty. This patch does not attempt to change the existing behaviour.
Cc: stable@vger.kernel.org Reported-by: Gavin Shan gshan@redhat.com Reported-by: Will Deacon will@kernel.org Suggested-by: Will Deacon will@kernel.org Fixes: 275bf960ac69 ("vhost: better detection of available buffers") Cc: "Jason Wang" jasowang@redhat.com Fixes: d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()") Cc: "Stefano Garzarella" sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin mst@redhat.com
I think it's better to bite the bullet and clean up the code. Note: this is still only built, not tested. Gavin could you help test please? Especially on the arm platform you have?
Will thanks so much for finding this race!
No problem, and I was also hoping that the smp_rmb() could be consolidated into a single helper like you've done here.
One minor comment below:
drivers/vhost/vhost.c | 80 +++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 38 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..26b70b1fd9ff 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1290,10 +1290,38 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d) mutex_unlock(&d->vqs[i]->mutex); } -static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
__virtio16 *idx)
+static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) {
- return vhost_get_avail(vq, *idx, &vq->avail->idx);
- __virtio16 idx;
- u16 avail_idx;
- int r = vhost_get_avail(vq, idx, &vq->avail->idx);
- if (unlikely(r < 0)) {
vq_err(vq, "Failed to access avail idx at %p: %d\n",
&vq->avail->idx, r);
return -EFAULT;
- }
- avail_idx = vhost16_to_cpu(vq, idx);
- /* Check it isn't doing very strange things with descriptor numbers. */
- if (unlikely((u16)(avail_idx - vq->last_avail_idx) > vq->num)) {
vq_err(vq, "Guest moved used index from %u to %u",
vq->last_avail_idx, vq->avail_idx);
return -EFAULT;
- }
- /* Nothing new? We are done. */
- if (avail_idx == vq->avail_idx)
return 0;
- vq->avail_idx = avail_idx;
- /* We updated vq->avail_idx so we need a memory barrier between
* the index read above and the caller reading avail ring entries.
*/
- smp_rmb();
I think you could use smp_acquire__after_ctrl_dep() if you're feeling brave, but to be honest I'd prefer we went in the opposite direction and used READ/WRITE_ONCE + smp_load_acquire()/smp_store_release() across the board. It's just a thankless, error-prone task to get there :(
Let's just say that's a separate patch, I tried hard to make this one a bugfix only, no other functional changes at all.
So, for the patch as-is:
Acked-by: Will Deacon will@kernel.org
(I've not tested it either though, so definitely wait for Gavin on that!)
Cheers,
Will
linux-stable-mirror@lists.linaro.org