Commit a59a27176914 ("drm/bridge: tc358767: convert to
devm_drm_bridge_alloc() API") split tc_probe_bridge_endpoint() in two
functions, thus duplicating the loop over the endpoints in each of those
functions. However it missed duplicating the of_graph_parse_endpoint() call
which initializes the struct of_endpoint, resulting in an uninitialized
read.
Fixes: a59a27176914 ("drm/bridge: tc358767: convert to devm_drm_bridge_alloc() API")
Cc: stable(a)vger.kernel.org
Reported-by: Colin King (gmail) <colin.i.king(a)gmail.com>
Closes: https://lore.kernel.org/all/056b34c3-c1ea-4b8c-9672-c98903ffd012@gmail.com/
Reviewed-by: Colin Ian King <colin.i.king(a)gmail.com>
Signed-off-by: Luca Ceresoli <luca.ceresoli(a)bootlin.com>
---
- Link to v1: https://lore.kernel.org/r/20250703-drm-bridge-alloc-fix-tc358767-regression…
---
Changes in v2:
- fix 'Closes:' tag URL
- add 'Cc: stable(a)vger.kernel.org'
---
drivers/gpu/drm/bridge/tc358767.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/drm/bridge/tc358767.c b/drivers/gpu/drm/bridge/tc358767.c
index 61559467e2d22b4b1b4223c97766ca3bf58908fd..562fea47b3ecf360e64a414e95ab5d645e610e9e 100644
--- a/drivers/gpu/drm/bridge/tc358767.c
+++ b/drivers/gpu/drm/bridge/tc358767.c
@@ -2422,6 +2422,7 @@ static int tc_probe_bridge_endpoint(struct tc_data *tc, enum tc_mode mode)
struct device_node *node = NULL;
for_each_endpoint_of_node(dev->of_node, node) {
+ of_graph_parse_endpoint(node, &endpoint);
if (endpoint.port == 2) {
of_property_read_u8_array(node, "toshiba,pre-emphasis",
tc->pre_emphasis,
---
base-commit: b4cd18f485687a2061ee7a0ce6833851fc4438da
change-id: 20250703-drm-bridge-alloc-fix-tc358767-regression-536ea2861af6
Best regards,
--
Luca Ceresoli <luca.ceresoli(a)bootlin.com>
Acquire GEM handles in drm_framebuffer_init() and release them in
the corresponding drm_framebuffer_cleanup(). Ties the handle's
lifetime to the framebuffer. Not all GEM buffer objects have GEM
handles. If not set, no refcounting takes place. This is the case
for some fbdev emulation. This is not a problem as these GEM objects
do not use dma-bufs and drivers will not release them while fbdev
emulation is running. Framebuffer flags keep a bit per color plane
of which the framebuffer holds a GEM handle reference.
As all drivers use drm_framebuffer_init(), they will now all hold
dma-buf references as fixed in commit 5307dce878d4 ("drm/gem: Acquire
references on GEM handles for framebuffers").
In the GEM framebuffer helpers, restore the original ref counting
on buffer objects. As the helpers for handle refcounting are now
no longer called from outside the DRM core, unexport the symbols.
v2:
- track framebuffer handle refs by flag
- drop gma500 cleanup (Christian)
Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Fixes: 5307dce878d4 ("drm/gem: Acquire references on GEM handles for framebuffers")
Reported-by: Bert Karwatzki <spasswolf(a)web.de>
Closes: https://lore.kernel.org/dri-devel/20250703115915.3096-1-spasswolf@web.de/
Tested-by: Bert Karwatzki <spasswolf(a)web.de>
Tested-by: Mario Limonciello <superm1(a)kernel.org>
Cc: Thomas Zimmermann <tzimmermann(a)suse.de>
Cc: Anusha Srivatsa <asrivats(a)redhat.com>
Cc: Christian König <christian.koenig(a)amd.com>
Cc: Maarten Lankhorst <maarten.lankhorst(a)linux.intel.com>
Cc: Maxime Ripard <mripard(a)kernel.org>
Cc: Sumit Semwal <sumit.semwal(a)linaro.org>
Cc: "Christian König" <christian.koenig(a)amd.com>
Cc: linux-media(a)vger.kernel.org
Cc: dri-devel(a)lists.freedesktop.org
Cc: linaro-mm-sig(a)lists.linaro.org
Cc: <stable(a)vger.kernel.org>
---
drivers/gpu/drm/drm_framebuffer.c | 31 ++++++++++++++--
drivers/gpu/drm/drm_gem.c | 38 ++++++++++++--------
drivers/gpu/drm/drm_gem_framebuffer_helper.c | 16 ++++-----
drivers/gpu/drm/drm_internal.h | 2 +-
drivers/gpu/drm/drm_modeset_helper.c | 2 +-
include/drm/drm_framebuffer.h | 9 +++++
6 files changed, 71 insertions(+), 27 deletions(-)
diff --git a/drivers/gpu/drm/drm_framebuffer.c b/drivers/gpu/drm/drm_framebuffer.c
index b781601946db..23b56cde21d7 100644
--- a/drivers/gpu/drm/drm_framebuffer.c
+++ b/drivers/gpu/drm/drm_framebuffer.c
@@ -862,11 +862,23 @@ EXPORT_SYMBOL_FOR_TESTS_ONLY(drm_framebuffer_free);
int drm_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *fb,
const struct drm_framebuffer_funcs *funcs)
{
+ unsigned int i;
int ret;
+ bool exists;
if (WARN_ON_ONCE(fb->dev != dev || !fb->format))
return -EINVAL;
+ for (i = 0; i < fb->format->num_planes; i++) {
+ if (drm_WARN_ON_ONCE(dev, fb->flags & DRM_FRAMEBUFFER_HAS_HANDLE_REF(i)))
+ fb->flags &= ~DRM_FRAMEBUFFER_HAS_HANDLE_REF(i);
+ if (fb->obj[i]) {
+ exists = drm_gem_object_handle_get_if_exists_unlocked(fb->obj[i]);
+ if (exists)
+ fb->flags |= DRM_FRAMEBUFFER_HAS_HANDLE_REF(i);
+ }
+ }
+
INIT_LIST_HEAD(&fb->filp_head);
fb->funcs = funcs;
@@ -875,7 +887,7 @@ int drm_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *fb,
ret = __drm_mode_object_add(dev, &fb->base, DRM_MODE_OBJECT_FB,
false, drm_framebuffer_free);
if (ret)
- goto out;
+ goto err;
mutex_lock(&dev->mode_config.fb_lock);
dev->mode_config.num_fb++;
@@ -883,7 +895,16 @@ int drm_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *fb,
mutex_unlock(&dev->mode_config.fb_lock);
drm_mode_object_register(dev, &fb->base);
-out:
+
+ return 0;
+
+err:
+ for (i = 0; i < fb->format->num_planes; i++) {
+ if (fb->flags & DRM_FRAMEBUFFER_HAS_HANDLE_REF(i)) {
+ drm_gem_object_handle_put_unlocked(fb->obj[i]);
+ fb->flags &= ~DRM_FRAMEBUFFER_HAS_HANDLE_REF(i);
+ }
+ }
return ret;
}
EXPORT_SYMBOL(drm_framebuffer_init);
@@ -960,6 +981,12 @@ EXPORT_SYMBOL(drm_framebuffer_unregister_private);
void drm_framebuffer_cleanup(struct drm_framebuffer *fb)
{
struct drm_device *dev = fb->dev;
+ unsigned int i;
+
+ for (i = 0; i < fb->format->num_planes; i++) {
+ if (fb->flags & DRM_FRAMEBUFFER_HAS_HANDLE_REF(i))
+ drm_gem_object_handle_put_unlocked(fb->obj[i]);
+ }
mutex_lock(&dev->mode_config.fb_lock);
list_del(&fb->head);
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index bc505d938b3e..41cdab6088ae 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -224,23 +224,34 @@ static void drm_gem_object_handle_get(struct drm_gem_object *obj)
}
/**
- * drm_gem_object_handle_get_unlocked - acquire reference on user-space handles
+ * drm_gem_object_handle_get_if_exists_unlocked - acquire reference on user-space handle, if any
* @obj: GEM object
*
- * Acquires a reference on the GEM buffer object's handle. Required
- * to keep the GEM object alive. Call drm_gem_object_handle_put_unlocked()
- * to release the reference.
+ * Acquires a reference on the GEM buffer object's handle. Required to keep
+ * the GEM object alive. Call drm_gem_object_handle_put_if_exists_unlocked()
+ * to release the reference. Does nothing if the buffer object has no handle.
+ *
+ * Returns:
+ * True if a handle exists, or false otherwise
*/
-void drm_gem_object_handle_get_unlocked(struct drm_gem_object *obj)
+bool drm_gem_object_handle_get_if_exists_unlocked(struct drm_gem_object *obj)
{
struct drm_device *dev = obj->dev;
guard(mutex)(&dev->object_name_lock);
- drm_WARN_ON(dev, !obj->handle_count); /* first ref taken in create-tail helper */
+ /*
+ * First ref taken during GEM object creation, if any. Some
+ * drivers set up internal framebuffers with GEM objects that
+ * do not have a GEM handle. Hence, this counter can be zero.
+ */
+ if (!obj->handle_count)
+ return false;
+
drm_gem_object_handle_get(obj);
+
+ return true;
}
-EXPORT_SYMBOL(drm_gem_object_handle_get_unlocked);
/**
* drm_gem_object_handle_free - release resources bound to userspace handles
@@ -273,7 +284,7 @@ static void drm_gem_object_exported_dma_buf_free(struct drm_gem_object *obj)
}
/**
- * drm_gem_object_handle_put_unlocked - releases reference on user-space handles
+ * drm_gem_object_handle_put_unlocked - releases reference on user-space handle
* @obj: GEM object
*
* Releases a reference on the GEM buffer object's handle. Possibly releases
@@ -284,14 +295,14 @@ void drm_gem_object_handle_put_unlocked(struct drm_gem_object *obj)
struct drm_device *dev = obj->dev;
bool final = false;
- if (WARN_ON(READ_ONCE(obj->handle_count) == 0))
+ if (drm_WARN_ON(dev, READ_ONCE(obj->handle_count) == 0))
return;
/*
- * Must bump handle count first as this may be the last
- * ref, in which case the object would disappear before we
- * checked for a name
- */
+ * Must bump handle count first as this may be the last
+ * ref, in which case the object would disappear before
+ * we checked for a name.
+ */
mutex_lock(&dev->object_name_lock);
if (--obj->handle_count == 0) {
@@ -304,7 +315,6 @@ void drm_gem_object_handle_put_unlocked(struct drm_gem_object *obj)
if (final)
drm_gem_object_put(obj);
}
-EXPORT_SYMBOL(drm_gem_object_handle_put_unlocked);
/*
* Called at device or object close to release the file's
diff --git a/drivers/gpu/drm/drm_gem_framebuffer_helper.c b/drivers/gpu/drm/drm_gem_framebuffer_helper.c
index c60d0044d036..618ce725cd75 100644
--- a/drivers/gpu/drm/drm_gem_framebuffer_helper.c
+++ b/drivers/gpu/drm/drm_gem_framebuffer_helper.c
@@ -100,7 +100,7 @@ void drm_gem_fb_destroy(struct drm_framebuffer *fb)
unsigned int i;
for (i = 0; i < fb->format->num_planes; i++)
- drm_gem_object_handle_put_unlocked(fb->obj[i]);
+ drm_gem_object_put(fb->obj[i]);
drm_framebuffer_cleanup(fb);
kfree(fb);
@@ -183,10 +183,8 @@ int drm_gem_fb_init_with_funcs(struct drm_device *dev,
if (!objs[i]) {
drm_dbg_kms(dev, "Failed to lookup GEM object\n");
ret = -ENOENT;
- goto err_gem_object_handle_put_unlocked;
+ goto err_gem_object_put;
}
- drm_gem_object_handle_get_unlocked(objs[i]);
- drm_gem_object_put(objs[i]);
min_size = (height - 1) * mode_cmd->pitches[i]
+ drm_format_info_min_pitch(info, i, width)
@@ -196,22 +194,22 @@ int drm_gem_fb_init_with_funcs(struct drm_device *dev,
drm_dbg_kms(dev,
"GEM object size (%zu) smaller than minimum size (%u) for plane %d\n",
objs[i]->size, min_size, i);
- drm_gem_object_handle_put_unlocked(objs[i]);
+ drm_gem_object_put(objs[i]);
ret = -EINVAL;
- goto err_gem_object_handle_put_unlocked;
+ goto err_gem_object_put;
}
}
ret = drm_gem_fb_init(dev, fb, mode_cmd, objs, i, funcs);
if (ret)
- goto err_gem_object_handle_put_unlocked;
+ goto err_gem_object_put;
return 0;
-err_gem_object_handle_put_unlocked:
+err_gem_object_put:
while (i > 0) {
--i;
- drm_gem_object_handle_put_unlocked(objs[i]);
+ drm_gem_object_put(objs[i]);
}
return ret;
}
diff --git a/drivers/gpu/drm/drm_internal.h b/drivers/gpu/drm/drm_internal.h
index f921cc73f8b8..e79c3c623c9a 100644
--- a/drivers/gpu/drm/drm_internal.h
+++ b/drivers/gpu/drm/drm_internal.h
@@ -161,7 +161,7 @@ void drm_sysfs_lease_event(struct drm_device *dev);
/* drm_gem.c */
int drm_gem_init(struct drm_device *dev);
-void drm_gem_object_handle_get_unlocked(struct drm_gem_object *obj);
+bool drm_gem_object_handle_get_if_exists_unlocked(struct drm_gem_object *obj);
void drm_gem_object_handle_put_unlocked(struct drm_gem_object *obj);
int drm_gem_handle_create_tail(struct drm_file *file_priv,
struct drm_gem_object *obj,
diff --git a/drivers/gpu/drm/drm_modeset_helper.c b/drivers/gpu/drm/drm_modeset_helper.c
index ef32f6af10d4..1e8822c4b370 100644
--- a/drivers/gpu/drm/drm_modeset_helper.c
+++ b/drivers/gpu/drm/drm_modeset_helper.c
@@ -94,7 +94,7 @@ void drm_helper_mode_fill_fb_struct(struct drm_device *dev,
fb->offsets[i] = mode_cmd->offsets[i];
}
fb->modifier = mode_cmd->modifier[0];
- fb->flags = mode_cmd->flags;
+ fb->flags = mode_cmd->flags & DRM_FRAMEBUFFER_FLAGS_UAPI_MASK;
}
EXPORT_SYMBOL(drm_helper_mode_fill_fb_struct);
diff --git a/include/drm/drm_framebuffer.h b/include/drm/drm_framebuffer.h
index 668077009fce..11fa20d21c58 100644
--- a/include/drm/drm_framebuffer.h
+++ b/include/drm/drm_framebuffer.h
@@ -23,6 +23,7 @@
#ifndef __DRM_FRAMEBUFFER_H__
#define __DRM_FRAMEBUFFER_H__
+#include <linux/bits.h>
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/sched.h>
@@ -100,6 +101,14 @@ struct drm_framebuffer_funcs {
unsigned num_clips);
};
+#define __DRM_FRAMEBUFFER_FLAGS_BIT_OFFSET 16
+
+#define DRM_FRAMEBUFFER_FLAGS_UAPI_MASK \
+ GENMASK(__DRM_FRAMEBUFFER_FLAGS_BIT_OFFSET - 1, 0)
+
+#define DRM_FRAMEBUFFER_HAS_HANDLE_REF(_i) \
+ BIT((__DRM_FRAMEBUFFER_FLAGS_BIT_OFFSET + (_i)))
+
/**
* struct drm_framebuffer - frame buffer object
*
--
2.50.0
Commit 48b4800a1c6a ("zsmalloc: page migration support") added support
for migrating zsmalloc pages using the movable_operations migration
framework. However, the commit did not take into account that zsmalloc
supports migration only when CONFIG_COMPACTION is enabled.
Tracing shows that zsmalloc was still passing the __GFP_MOVABLE flag
even when compaction is not supported.
This can result in unmovable pages being allocated from movable page
blocks (even without stealing page blocks), ZONE_MOVABLE and CMA area.
Clear the __GFP_MOVABLE flag when !IS_ENABLED(CONFIG_COMPACTION).
Cc: stable(a)vger.kernel.org
Fixes: 48b4800a1c6a ("zsmalloc: page migration support")
Signed-off-by: Harry Yoo <harry.yoo(a)oracle.com>
---
mm/zsmalloc.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 999b513c7fdf..f3e2215f95eb 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1043,6 +1043,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
if (!zspage)
return NULL;
+ if (!IS_ENABLED(CONFIG_COMPACTION))
+ gfp &= ~__GFP_MOVABLE;
+
zspage->magic = ZSPAGE_MAGIC;
zspage->pool = pool;
zspage->class = class->index;
--
2.43.0
From: Francesco Dolcini <francesco.dolcini(a)toradex.com>
LDO5 regulator is used to power the i.MX8MM NVCC_SD2 I/O supply, that is
used for the SD2 card interface and also for some GPIOs.
When the SD card interface is not enabled the regulator subsystem could
turn off this supply, since it is not used anywhere else, however this
will also remove the power to some other GPIOs, for example one I/O that
is used to power the ethernet phy, leading to a non working ethernet
interface.
[ 31.820515] On-module +V3.3_1.8_SD (LDO5): disabling
[ 31.821761] PMIC_USDHC_VSELECT: disabling
[ 32.764949] fec 30be0000.ethernet end0: Link is Down
Fix this keeping the LDO5 supply always on.
Cc: stable(a)vger.kernel.org
Fixes: 6a57f224f734 ("arm64: dts: freescale: add initial support for verdin imx8m mini")
Fixes: f5aab0438ef1 ("regulator: pca9450: Fix enable register for LDO5")
Signed-off-by: Francesco Dolcini <francesco.dolcini(a)toradex.com>
---
arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
index d29710772569..1594ce9182a5 100644
--- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
@@ -464,6 +464,7 @@ reg_vdd_phy: LDO4 {
};
reg_nvcc_sd: LDO5 {
+ regulator-always-on;
regulator-max-microvolt = <3300000>;
regulator-min-microvolt = <1800000>;
regulator-name = "On-module +V3.3_1.8_SD (LDO5)";
--
2.39.5
Commit 88c02b3f79a6 ("s390/sha3: Support sha3 performance enhancements")
added the field s390_sha_ctx::first_message_part and made it be used by
s390_sha_update_blocks(). At the time, s390_sha_update_blocks() was
used by all the s390 SHA-1, SHA-2, and SHA-3 algorithms. However, only
the initialization functions for SHA-3 were updated, leaving SHA-1 and
SHA-2 using first_message_part uninitialized.
This could cause e.g. CPACF_KIMD_SHA_512 | CPACF_KIMD_NIP to be used
instead of just CPACF_KIMD_NIP. It's unclear why this didn't cause a
problem earlier; this bug was found only when UBSAN detected the
uninitialized boolean. Perhaps the CPU ignores CPACF_KIMD_NIP for SHA-1
and SHA-2. Regardless, let's fix this. For now just initialize to
false, i.e. don't try to "optimize" the SHA state initialization.
Note: in 6.16, we need to patch SHA-1, SHA-384, and SHA-512. In 6.15
and earlier, we'll also need to patch SHA-224 and SHA-256, as they
hadn't yet been librarified (which incidentally fixed this bug).
Fixes: 88c02b3f79a6 ("s390/sha3: Support sha3 performance enhancements")
Cc: stable(a)vger.kernel.org
Reported-by: Ingo Franzki <ifranzki(a)linux.ibm.com>
Closes: https://lore.kernel.org/r/12740696-595c-4604-873e-aefe8b405fbf@linux.ibm.com
Signed-off-by: Eric Biggers <ebiggers(a)kernel.org>
---
This is targeting 6.16. I'd prefer to take this through
libcrypto-fixes, since the librarification work is also touching this
area. But let me know if there's a preference for the crypto tree or
the s390 tree instead.
arch/s390/crypto/sha1_s390.c | 1 +
arch/s390/crypto/sha512_s390.c | 2 ++
2 files changed, 3 insertions(+)
diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c
index d229cbd2ba229..73672e76a88f9 100644
--- a/arch/s390/crypto/sha1_s390.c
+++ b/arch/s390/crypto/sha1_s390.c
@@ -36,10 +36,11 @@ static int s390_sha1_init(struct shash_desc *desc)
sctx->state[2] = SHA1_H2;
sctx->state[3] = SHA1_H3;
sctx->state[4] = SHA1_H4;
sctx->count = 0;
sctx->func = CPACF_KIMD_SHA_1;
+ sctx->first_message_part = false;
return 0;
}
static int s390_sha1_export(struct shash_desc *desc, void *out)
diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c
index 33711a29618c3..e9e112025ff22 100644
--- a/arch/s390/crypto/sha512_s390.c
+++ b/arch/s390/crypto/sha512_s390.c
@@ -30,10 +30,11 @@ static int sha512_init(struct shash_desc *desc)
ctx->sha512.state[6] = SHA512_H6;
ctx->sha512.state[7] = SHA512_H7;
ctx->count = 0;
ctx->sha512.count_hi = 0;
ctx->func = CPACF_KIMD_SHA_512;
+ ctx->first_message_part = false;
return 0;
}
static int sha512_export(struct shash_desc *desc, void *out)
@@ -95,10 +96,11 @@ static int sha384_init(struct shash_desc *desc)
ctx->sha512.state[6] = SHA384_H6;
ctx->sha512.state[7] = SHA384_H7;
ctx->count = 0;
ctx->sha512.count_hi = 0;
ctx->func = CPACF_KIMD_SHA_512;
+ ctx->first_message_part = false;
return 0;
}
static struct shash_alg sha384_alg = {
base-commit: e540341508ce2f6e27810106253d5de194b66750
--
2.50.0
Restore the len >= 288 condition on using the AVX implementation, which
was incidentally removed by commit 318c53ae02f2 ("crypto: x86/poly1305 -
Add block-only interface"). This check took into account the overhead
in key power computation, kernel-mode "FPU", and tail handling
associated with the AVX code. Indeed, restoring this check slightly
improves performance for len < 256 as measured using poly1305_kunit on
an "AMD Ryzen AI 9 365" (Zen 5) CPU:
Length Before After
====== ========== ==========
1 30 MB/s 36 MB/s
16 516 MB/s 598 MB/s
64 1700 MB/s 1882 MB/s
127 2265 MB/s 2651 MB/s
128 2457 MB/s 2827 MB/s
200 2702 MB/s 3238 MB/s
256 3841 MB/s 3768 MB/s
511 4580 MB/s 4585 MB/s
512 5430 MB/s 5398 MB/s
1024 7268 MB/s 7305 MB/s
3173 8999 MB/s 8948 MB/s
4096 9942 MB/s 9921 MB/s
16384 10557 MB/s 10545 MB/s
While the optimal threshold for this CPU might be slightly lower than
288 (see the len == 256 case), other CPUs would need to be tested too,
and these sorts of benchmarks can underestimate the true cost of
kernel-mode "FPU". Therefore, for now just restore the 288 threshold.
Fixes: 318c53ae02f2 ("crypto: x86/poly1305 - Add block-only interface")
Cc: stable(a)vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers(a)kernel.org>
---
lib/crypto/x86/poly1305_glue.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/lib/crypto/x86/poly1305_glue.c b/lib/crypto/x86/poly1305_glue.c
index 968d84677631..856d48fd422b 100644
--- a/lib/crypto/x86/poly1305_glue.c
+++ b/lib/crypto/x86/poly1305_glue.c
@@ -96,11 +96,19 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp,
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
SZ_4K % POLY1305_BLOCK_SIZE);
+ /*
+ * The AVX implementations have significant setup overhead (e.g. key
+ * power computation, kernel FPU enabling) which makes them slower for
+ * short messages. Fall back to the scalar implementation for messages
+ * shorter than 288 bytes, unless the AVX-specific key setup has already
+ * been performed (indicated by ctx->is_base2_26).
+ */
if (!static_branch_likely(&poly1305_use_avx) ||
+ (len < POLY1305_BLOCK_SIZE * 18 && !ctx->is_base2_26) ||
unlikely(!irq_fpu_usable())) {
convert_to_base2_64(ctx);
poly1305_blocks_x86_64(ctx, inp, len, padbit);
return;
}
--
2.50.0
Restore the SIMD usability check and base conversion that were removed
by commit 318c53ae02f2 ("crypto: x86/poly1305 - Add block-only
interface").
This safety check is cheap and is well worth eliminating a footgun.
While the Poly1305 functions *should* be called only where SIMD
registers are usable, if they are anyway, they should just do the right
thing instead of corrupting random tasks' registers and/or computing
incorrect MACs. Fixing this is also needed for poly1305_kunit to pass.
Just use irq_fpu_usable() instead of the original crypto_simd_usable(),
since poly1305_kunit won't rely on crypto_simd_disabled_for_test.
Fixes: 318c53ae02f2 ("crypto: x86/poly1305 - Add block-only interface")
Cc: stable(a)vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers(a)kernel.org>
---
lib/crypto/x86/poly1305_glue.c | 40 +++++++++++++++++++++++++++++++++-
1 file changed, 39 insertions(+), 1 deletion(-)
diff --git a/lib/crypto/x86/poly1305_glue.c b/lib/crypto/x86/poly1305_glue.c
index b7e78a583e07..968d84677631 100644
--- a/lib/crypto/x86/poly1305_glue.c
+++ b/lib/crypto/x86/poly1305_glue.c
@@ -23,10 +23,46 @@ struct poly1305_arch_internal {
u64 r[2];
u64 pad;
struct { u32 r2, r1, r4, r3; } rn[9];
};
+/*
+ * The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
+ * the unfortunate situation of using AVX and then having to go back to scalar
+ * -- because the user is silly and has called the update function from two
+ * separate contexts -- then we need to convert back to the original base before
+ * proceeding. It is possible to reason that the initial reduction below is
+ * sufficient given the implementation invariants. However, for an avoidance of
+ * doubt and because this is not performance critical, we do the full reduction
+ * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
+ */
+static void convert_to_base2_64(void *ctx)
+{
+ struct poly1305_arch_internal *state = ctx;
+ u32 cy;
+
+ if (!state->is_base2_26)
+ return;
+
+ cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
+ cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
+ cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
+ cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
+ state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
+ state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
+ state->hs[2] = state->h[4] >> 24;
+ /* Unsigned Less Than: branchlessly produces 1 if a < b, else 0. */
+#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
+ cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
+ state->hs[2] &= 3;
+ state->hs[0] += cy;
+ state->hs[1] += (cy = ULT(state->hs[0], cy));
+ state->hs[2] += ULT(state->hs[1], cy);
+#undef ULT
+ state->is_base2_26 = 0;
+}
+
asmlinkage void poly1305_block_init_arch(
struct poly1305_block_state *state,
const u8 raw_key[POLY1305_BLOCK_SIZE]);
EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx,
@@ -60,11 +96,13 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp,
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
SZ_4K % POLY1305_BLOCK_SIZE);
- if (!static_branch_likely(&poly1305_use_avx)) {
+ if (!static_branch_likely(&poly1305_use_avx) ||
+ unlikely(!irq_fpu_usable())) {
+ convert_to_base2_64(ctx);
poly1305_blocks_x86_64(ctx, inp, len, padbit);
return;
}
do {
--
2.50.0