Starting with clang-9, I get a warning about the crypto/blake2b_generic.c implementation of the Linux kernel when building for armv7-m: This does not happen with clang-8, or when targeting any other ARM version (v7-a, v5, ...) or any other architecture I tried. Also, using "-Os" optimization instead of "-O2" avoids the problem, using only the usual 480 bytes of stack. Here is a reduced test case: typedef unsigned char u8; typedef unsigned int u32; typedef unsigned long long u64; static inline u64 ror64(u64 word, unsigned int shift) { return (word >> (shift & 63)) | (word << ((-shift) & 63)); } struct blake2b_state { u64 h[8]; }; static const u64 blake2b_IV[8] = { 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL }; static const u8 blake2b_sigma[12][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } }; #define g(r,i,a,b,c,d) \ do { \ a = a + b + m[blake2b_sigma[r][2*i+0]]; \ d = ror64(d ^ a, 32); \ c = c + d; \ b = ror64(b ^ c, 24); \ d = ror64(d ^ a, 16); \ b = ror64(b ^ c, 63); \ } while (0) #define ROUND(r) \ do { \ g(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ g(r,4,v[ 0],v[ 5],v[10],v[15]); \ g(r,5,v[ 1],v[ 6],v[11],v[12]); \ g(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ g(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ } while (0) void blake2b_compress(struct blake2b_state *S, const u8 block[]) { u64 m[16]; u64 v[16]; unsigned long i; for (i = 0; i < 16; ++i) m[i] = *(u64*)(block + i * sizeof(m[i])); for (i = 0; i < 8; ++i) v[i] = S->h[i]; v[8] = blake2b_IV[0]; v[9] = blake2b_IV[1]; v[10] = blake2b_IV[2]; v[11] = blake2b_IV[3]; ROUND(0); ROUND(1); ROUND(2); ROUND(3); ROUND(4); ROUND(5); ROUND(6); ROUND(7); ROUND(8); ROUND(9); ROUND(10); ROUND(11); for (i = 0; i < 8; ++i) S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; } $ clang-11 -O2 -mthumb -Wframe-larger-than=100 --target=arm-linux -msoft-float -march=armv7-a test.c warning: stack frame size of 1200 bytes in function 'blake2b_compress' [-Wframe-larger-than=] See also https://godbolt.org/z/B5jCfH
After having another look at the differences between -Os and -O2 I noticed that the latter performs more loop unrolling. The code is written to assume no loops are unrolled, and forcing the final loop to not be unrolled using "#pragma nounroll" works around the issue.
It looks like this was regressed by https://github.com/llvm/llvm-project/commit/d2d0f46cd2ae0684afa58d73e5effc1a197481d4 according to my bisect.