On Wed, Jul 03, 2019 at 10:01:37AM -0700, Dan Williams wrote:
On Wed, Jul 3, 2019 at 5:17 AM Matthew Wilcox <willy@infradead.org> wrote:
On Wed, Jul 03, 2019 at 12:24:54AM -0700, Dan Williams wrote:
This fix may increase waitqueue contention, but a fix for that is saved for a larger rework. In the meantime this fix is suitable for -stable backports.
I think this is too big for what it is; just the two-line patch to stop incorporating the low bits of the PTE would be more appropriate.
Sufficient, yes, "appropriate", not so sure. All those comments about pmd entry size are stale after this change.
But then they'll have to be put back in again. This seems to be working for me, although I doubt I'm actually hitting the edge case that rocksdb hits:
diff --git a/fs/dax.c b/fs/dax.c
index 2e48c7ebb973..e77bd6aef10c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -198,6 +198,10 @@ static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
  * if it did.
  *
  * Must be called with the i_pages lock held.
+ *
+ * If the xa_state refers to a larger entry, then it may return a locked
+ * smaller entry (eg a PTE entry) without waiting for the smaller entry
+ * to be unlocked.
  */
 static void *get_unlocked_entry(struct xa_state *xas)
 {
@@ -211,7 +215,8 @@ static void *get_unlocked_entry(struct xa_state *xas)
 	for (;;) {
 		entry = xas_find_conflict(xas);
 		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
-				!dax_is_locked(entry))
+				!dax_is_locked(entry) ||
+				dax_entry_order(entry) < xas_get_order(xas))
 			return entry;
 		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
@@ -253,8 +258,12 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
 static void put_unlocked_entry(struct xa_state *xas, void *entry)
 {
-	/* If we were the only waiter woken, wake the next one */
-	if (entry)
+	/*
+	 * If we were the only waiter woken, wake the next one.
+	 * Do not wake anybody if the entry is locked; that indicates
+	 * we weren't woken.
+	 */
+	if (entry && !dax_is_locked(entry))
 		dax_wake_entry(xas, entry, false);
 }
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 052e06ff4c36..b17289d92af4 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1529,6 +1529,27 @@ static inline void xas_set_order(struct xa_state *xas, unsigned long index,
 #endif
 }
+/**
+ * xas_get_order() - Get the order of the entry being operated on.
+ * @xas: XArray operation state.
+ *
+ * Return: The order of the entry.
+ */
+static inline unsigned int xas_get_order(const struct xa_state *xas)
+{
+	unsigned int order = xas->xa_shift;
+
+#ifdef CONFIG_XARRAY_MULTI
+	unsigned int sibs = xas->xa_sibs;
+
+	while (sibs) {
+		order++;
+		sibs /= 2;
+	}
+#endif
+	return order;
+}
+
 /**
  * xas_set_update() - Set up XArray operation state for a callback.
  * @xas: XArray operation state.
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index 7df4f7f395bf..af024477ec93 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -95,6 +95,17 @@ static noinline void check_xa_err(struct xarray *xa)
 //	XA_BUG_ON(xa, xa_err(xa_store(xa, 0, xa_mk_internal(0), 0)) != -EINVAL);
 }
+static noinline void check_xas_order(struct xarray *xa)
+{
+	XA_STATE(xas, xa, 0);
+	unsigned int i;
+
+	for (i = 0; i < sizeof(long) * 8; i++) {
+		xas_set_order(&xas, 0, i);
+		XA_BUG_ON(xa, xas_get_order(&xas) != i);
+	}
+}
+
 static noinline void check_xas_retry(struct xarray *xa)
 {
 	XA_STATE(xas, xa, 0);
@@ -1583,6 +1594,7 @@ static DEFINE_XARRAY(array);
 static int xarray_checks(void)
 {
 	check_xa_err(&array);
+	check_xas_order(&array);
 	check_xas_retry(&array);
 	check_xa_load(&array);
 	check_xa_mark(&array);