vhost_get_vq_desc (correctly) uses smp_rmb to order avail ring reads after index reads. However, over time we added two more places that read the index and do not bother with barriers. Since vhost_get_vq_desc when it was written assumed it is the only reader when it sees a new index value is cached it does not bother with a barrier either, as a result, on the nvidia-gracehopper platform (arm64) available ring entry reads have been observed bypassing ring reads, causing a ring corruption.
To fix, factor out the correct index access code from vhost_get_vq_desc. As a side benefit, we also validate the index on all paths now, which will hopefully help catch future errors earlier.
Note: current code is inconsistent in how it handles errors: some places treat it as an empty ring, others - non empty. This patch does not attempt to change the existing behaviour.
Cc: stable@vger.kernel.org Reported-by: Gavin Shan gshan@redhat.com Reported-by: Will Deacon will@kernel.org Suggested-by: Will Deacon will@kernel.org Fixes: 275bf960ac69 ("vhost: better detection of available buffers") Cc: "Jason Wang" jasowang@redhat.com Fixes: d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()") Cc: "Stefano Garzarella" sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin mst@redhat.com ---
I think it's better to bite the bullet and clean up the code. Note: this is still only built, not tested. Gavin could you help test please? Especially on the arm platform you have?
Will thanks so much for finding this race!
drivers/vhost/vhost.c | 80 +++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 38 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..26b70b1fd9ff 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1290,10 +1290,38 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d) mutex_unlock(&d->vqs[i]->mutex); }
-static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, - __virtio16 *idx) +static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) { - return vhost_get_avail(vq, *idx, &vq->avail->idx); + __virtio16 idx; + u16 avail_idx; + int r = vhost_get_avail(vq, idx, &vq->avail->idx); + + if (unlikely(r < 0)) { + vq_err(vq, "Failed to access avail idx at %p: %d\n", + &vq->avail->idx, r); + return -EFAULT; + } + + avail_idx = vhost16_to_cpu(vq, idx); + + /* Check it isn't doing very strange things with descriptor numbers. */ + if (unlikely((u16)(avail_idx - vq->last_avail_idx) > vq->num)) { + vq_err(vq, "Guest moved used index from %u to %u", + vq->last_avail_idx, vq->avail_idx); + return -EFAULT; + } + + /* Nothing new? We are done. */ + if (avail_idx == vq->avail_idx) + return 0; + + vq->avail_idx = avail_idx; + + /* We updated vq->avail_idx so we need a memory barrier between + * the index read above and the caller reading avail ring entries. + */ + smp_rmb(); + return 1; }
static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, @@ -2498,38 +2526,21 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - u16 last_avail_idx; - __virtio16 avail_idx; + u16 last_avail_idx = vq->last_avail_idx; __virtio16 ring_head; int ret, access;
- /* Check it isn't doing very strange things with descriptor numbers. */ - last_avail_idx = vq->last_avail_idx;
if (vq->avail_idx == vq->last_avail_idx) { - if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) { - vq_err(vq, "Failed to access avail idx at %p\n", - &vq->avail->idx); - return -EFAULT; - } - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - - if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { - vq_err(vq, "Guest moved used index from %u to %u", - last_avail_idx, vq->avail_idx); - return -EFAULT; - } + ret = vhost_get_avail_idx(vq); + if (unlikely(ret < 0)) + return ret;
/* If there's nothing new since last we looked, return * invalid. */ - if (vq->avail_idx == last_avail_idx) + if (!ret) return vq->num; - - /* Only get avail ring entries after they have been - * exposed by guest. - */ - smp_rmb(); }
/* Grab the next descriptor number they're advertising, and increment @@ -2790,25 +2801,21 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); /* return true if we're sure that avaiable ring is empty */ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; int r;
if (vq->avail_idx != vq->last_avail_idx) return false;
- r = vhost_get_avail_idx(vq, &avail_idx); - if (unlikely(r)) - return false; - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + r = vhost_get_avail_idx(vq);
- return vq->avail_idx == vq->last_avail_idx; + /* Note: we treat error as non-empty here */ + return r == 0; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
/* OK, now we need to know about added descriptors. */ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; int r;
if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) @@ -2832,13 +2839,10 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) /* They could have slipped one in as we were doing that: make * sure it's written, then check again. */ smp_mb(); - r = vhost_get_avail_idx(vq, &avail_idx); - if (r) { - vq_err(vq, "Failed to check avail idx at %p: %d\n", - &vq->avail->idx, r); + r = vhost_get_avail_idx(vq); + /* Note: we treat error as empty here */ + if (r < 0) return false; - } - vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
return vq->avail_idx != vq->last_avail_idx; }