The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to stable@vger.kernel.org.
This shouldn't have been picked up for -stable: the optimized SHA3 C
code was only merged recently, so there are no stable trees to which
this fix applies.
--
Ard.
> ------------------ original commit in Linus's tree ------------------
>
> From 4767b9ad7d762876a5865a06465e13e139a01b6b Mon Sep 17 00:00:00 2001
> From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> Date: Sat, 27 Jan 2018 09:18:32 +0000
> Subject: [PATCH] crypto: sha3-generic - deal with oversize stack frames
>
> As reported by kbuild test robot, the optimized SHA3 C implementation
> compiles to mn10300 code that uses a disproportionate amount of stack
> space, i.e.,
>
> crypto/sha3_generic.c: In function 'keccakf':
> crypto/sha3_generic.c:147:1: warning: the frame size of 1232 bytes is larger than 1024 bytes [-Wframe-larger-than=]
>
> As kindly diagnosed by Arnd, this does not only occur when building for
> the mn10300 architecture (which is what the report was about) but also
> for h8300, and builds for other 32-bit architectures show an increase in
> stack space utilization as well.
>
> Given that SHA3 operates on 64-bit quantities, and keeps a state matrix
> of 25 64-bit words, it is not surprising that 32-bit architectures with
> few general purpose registers are impacted the most by this, and it is
> therefore reasonable to implement a workaround that distinguishes between
> 32-bit and 64-bit architectures.
>
> Arnd figured out that taking the round calculation out of the loop, and
> inlining it explicitly but only on 64-bit architectures preserves most
> of the performance gain achieved by the rewrite, and also gets rid of
> the excessive use of stack space.
>
> Reported-by: kbuild test robot <fengguang.wu@intel.com>
> Suggested-by: Arnd Bergmann <arnd@arndb.de>
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
> diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c
> index a965b9d80559..951c4eb70262 100644
> --- a/crypto/sha3_generic.c
> +++ b/crypto/sha3_generic.c
> @@ -20,6 +20,20 @@
> #include <crypto/sha3.h>
> #include <asm/unaligned.h>
>
> +/*
> + * On some 32-bit architectures (mn10300 and h8300), GCC ends up using
> + * over 1 KB of stack if we inline the round calculation into the loop
> + * in keccakf(). On the other hand, on 64-bit architectures with plenty
> + * of [64-bit wide] general purpose registers, not inlining it severely
> + * hurts performance. So let's use 64-bitness as a heuristic to decide
> + * whether to inline or not.
> + */
> +#ifdef CONFIG_64BIT
> +#define SHA3_INLINE inline
> +#else
> +#define SHA3_INLINE noinline
> +#endif
> +
> #define KECCAK_ROUNDS 24
>
> static const u64 keccakf_rndc[24] = {
> @@ -35,111 +49,115 @@ static const u64 keccakf_rndc[24] = {
>
> /* update the state with given number of rounds */
>
> -static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
> +static SHA3_INLINE void keccakf_round(u64 st[25])
> {
> u64 t[5], tt, bc[5];
> - int round;
>
> - for (round = 0; round < KECCAK_ROUNDS; round++) {
> + /* Theta */
> + bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
> + bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
> + bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
> + bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
> + bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
> +
> + t[0] = bc[4] ^ rol64(bc[1], 1);
> + t[1] = bc[0] ^ rol64(bc[2], 1);
> + t[2] = bc[1] ^ rol64(bc[3], 1);
> + t[3] = bc[2] ^ rol64(bc[4], 1);
> + t[4] = bc[3] ^ rol64(bc[0], 1);
> +
> + st[0] ^= t[0];
> +
> + /* Rho Pi */
> + tt = st[1];
> + st[ 1] = rol64(st[ 6] ^ t[1], 44);
> + st[ 6] = rol64(st[ 9] ^ t[4], 20);
> + st[ 9] = rol64(st[22] ^ t[2], 61);
> + st[22] = rol64(st[14] ^ t[4], 39);
> + st[14] = rol64(st[20] ^ t[0], 18);
> + st[20] = rol64(st[ 2] ^ t[2], 62);
> + st[ 2] = rol64(st[12] ^ t[2], 43);
> + st[12] = rol64(st[13] ^ t[3], 25);
> + st[13] = rol64(st[19] ^ t[4], 8);
> + st[19] = rol64(st[23] ^ t[3], 56);
> + st[23] = rol64(st[15] ^ t[0], 41);
> + st[15] = rol64(st[ 4] ^ t[4], 27);
> + st[ 4] = rol64(st[24] ^ t[4], 14);
> + st[24] = rol64(st[21] ^ t[1], 2);
> + st[21] = rol64(st[ 8] ^ t[3], 55);
> + st[ 8] = rol64(st[16] ^ t[1], 45);
> + st[16] = rol64(st[ 5] ^ t[0], 36);
> + st[ 5] = rol64(st[ 3] ^ t[3], 28);
> + st[ 3] = rol64(st[18] ^ t[3], 21);
> + st[18] = rol64(st[17] ^ t[2], 15);
> + st[17] = rol64(st[11] ^ t[1], 10);
> + st[11] = rol64(st[ 7] ^ t[2], 6);
> + st[ 7] = rol64(st[10] ^ t[0], 3);
> + st[10] = rol64( tt ^ t[1], 1);
> +
> + /* Chi */
> + bc[ 0] = ~st[ 1] & st[ 2];
> + bc[ 1] = ~st[ 2] & st[ 3];
> + bc[ 2] = ~st[ 3] & st[ 4];
> + bc[ 3] = ~st[ 4] & st[ 0];
> + bc[ 4] = ~st[ 0] & st[ 1];
> + st[ 0] ^= bc[ 0];
> + st[ 1] ^= bc[ 1];
> + st[ 2] ^= bc[ 2];
> + st[ 3] ^= bc[ 3];
> + st[ 4] ^= bc[ 4];
> +
> + bc[ 0] = ~st[ 6] & st[ 7];
> + bc[ 1] = ~st[ 7] & st[ 8];
> + bc[ 2] = ~st[ 8] & st[ 9];
> + bc[ 3] = ~st[ 9] & st[ 5];
> + bc[ 4] = ~st[ 5] & st[ 6];
> + st[ 5] ^= bc[ 0];
> + st[ 6] ^= bc[ 1];
> + st[ 7] ^= bc[ 2];
> + st[ 8] ^= bc[ 3];
> + st[ 9] ^= bc[ 4];
> +
> + bc[ 0] = ~st[11] & st[12];
> + bc[ 1] = ~st[12] & st[13];
> + bc[ 2] = ~st[13] & st[14];
> + bc[ 3] = ~st[14] & st[10];
> + bc[ 4] = ~st[10] & st[11];
> + st[10] ^= bc[ 0];
> + st[11] ^= bc[ 1];
> + st[12] ^= bc[ 2];
> + st[13] ^= bc[ 3];
> + st[14] ^= bc[ 4];
> +
> + bc[ 0] = ~st[16] & st[17];
> + bc[ 1] = ~st[17] & st[18];
> + bc[ 2] = ~st[18] & st[19];
> + bc[ 3] = ~st[19] & st[15];
> + bc[ 4] = ~st[15] & st[16];
> + st[15] ^= bc[ 0];
> + st[16] ^= bc[ 1];
> + st[17] ^= bc[ 2];
> + st[18] ^= bc[ 3];
> + st[19] ^= bc[ 4];
> +
> + bc[ 0] = ~st[21] & st[22];
> + bc[ 1] = ~st[22] & st[23];
> + bc[ 2] = ~st[23] & st[24];
> + bc[ 3] = ~st[24] & st[20];
> + bc[ 4] = ~st[20] & st[21];
> + st[20] ^= bc[ 0];
> + st[21] ^= bc[ 1];
> + st[22] ^= bc[ 2];
> + st[23] ^= bc[ 3];
> + st[24] ^= bc[ 4];
> +}
>
> - /* Theta */
> - bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
> - bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
> - bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
> - bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
> - bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
> -
> - t[0] = bc[4] ^ rol64(bc[1], 1);
> - t[1] = bc[0] ^ rol64(bc[2], 1);
> - t[2] = bc[1] ^ rol64(bc[3], 1);
> - t[3] = bc[2] ^ rol64(bc[4], 1);
> - t[4] = bc[3] ^ rol64(bc[0], 1);
> -
> - st[0] ^= t[0];
> -
> - /* Rho Pi */
> - tt = st[1];
> - st[ 1] = rol64(st[ 6] ^ t[1], 44);
> - st[ 6] = rol64(st[ 9] ^ t[4], 20);
> - st[ 9] = rol64(st[22] ^ t[2], 61);
> - st[22] = rol64(st[14] ^ t[4], 39);
> - st[14] = rol64(st[20] ^ t[0], 18);
> - st[20] = rol64(st[ 2] ^ t[2], 62);
> - st[ 2] = rol64(st[12] ^ t[2], 43);
> - st[12] = rol64(st[13] ^ t[3], 25);
> - st[13] = rol64(st[19] ^ t[4], 8);
> - st[19] = rol64(st[23] ^ t[3], 56);
> - st[23] = rol64(st[15] ^ t[0], 41);
> - st[15] = rol64(st[ 4] ^ t[4], 27);
> - st[ 4] = rol64(st[24] ^ t[4], 14);
> - st[24] = rol64(st[21] ^ t[1], 2);
> - st[21] = rol64(st[ 8] ^ t[3], 55);
> - st[ 8] = rol64(st[16] ^ t[1], 45);
> - st[16] = rol64(st[ 5] ^ t[0], 36);
> - st[ 5] = rol64(st[ 3] ^ t[3], 28);
> - st[ 3] = rol64(st[18] ^ t[3], 21);
> - st[18] = rol64(st[17] ^ t[2], 15);
> - st[17] = rol64(st[11] ^ t[1], 10);
> - st[11] = rol64(st[ 7] ^ t[2], 6);
> - st[ 7] = rol64(st[10] ^ t[0], 3);
> - st[10] = rol64( tt ^ t[1], 1);
> -
> - /* Chi */
> - bc[ 0] = ~st[ 1] & st[ 2];
> - bc[ 1] = ~st[ 2] & st[ 3];
> - bc[ 2] = ~st[ 3] & st[ 4];
> - bc[ 3] = ~st[ 4] & st[ 0];
> - bc[ 4] = ~st[ 0] & st[ 1];
> - st[ 0] ^= bc[ 0];
> - st[ 1] ^= bc[ 1];
> - st[ 2] ^= bc[ 2];
> - st[ 3] ^= bc[ 3];
> - st[ 4] ^= bc[ 4];
> -
> - bc[ 0] = ~st[ 6] & st[ 7];
> - bc[ 1] = ~st[ 7] & st[ 8];
> - bc[ 2] = ~st[ 8] & st[ 9];
> - bc[ 3] = ~st[ 9] & st[ 5];
> - bc[ 4] = ~st[ 5] & st[ 6];
> - st[ 5] ^= bc[ 0];
> - st[ 6] ^= bc[ 1];
> - st[ 7] ^= bc[ 2];
> - st[ 8] ^= bc[ 3];
> - st[ 9] ^= bc[ 4];
> -
> - bc[ 0] = ~st[11] & st[12];
> - bc[ 1] = ~st[12] & st[13];
> - bc[ 2] = ~st[13] & st[14];
> - bc[ 3] = ~st[14] & st[10];
> - bc[ 4] = ~st[10] & st[11];
> - st[10] ^= bc[ 0];
> - st[11] ^= bc[ 1];
> - st[12] ^= bc[ 2];
> - st[13] ^= bc[ 3];
> - st[14] ^= bc[ 4];
> -
> - bc[ 0] = ~st[16] & st[17];
> - bc[ 1] = ~st[17] & st[18];
> - bc[ 2] = ~st[18] & st[19];
> - bc[ 3] = ~st[19] & st[15];
> - bc[ 4] = ~st[15] & st[16];
> - st[15] ^= bc[ 0];
> - st[16] ^= bc[ 1];
> - st[17] ^= bc[ 2];
> - st[18] ^= bc[ 3];
> - st[19] ^= bc[ 4];
> -
> - bc[ 0] = ~st[21] & st[22];
> - bc[ 1] = ~st[22] & st[23];
> - bc[ 2] = ~st[23] & st[24];
> - bc[ 3] = ~st[24] & st[20];
> - bc[ 4] = ~st[20] & st[21];
> - st[20] ^= bc[ 0];
> - st[21] ^= bc[ 1];
> - st[22] ^= bc[ 2];
> - st[23] ^= bc[ 3];
> - st[24] ^= bc[ 4];
> +static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
> +{
> + int round;
>
> + for (round = 0; round < KECCAK_ROUNDS; round++) {
> + keccakf_round(st);
> /* Iota */
> st[0] ^= keccakf_rndc[round];
> }
>