#ifdef __ARM_BIG_ENDIAN
#error "This implementation only supports little-endian ARM."
#endif
  // loadu_128: unaligned 16-byte load (vld1q_u8 has no alignment requirement).
  return vreinterpretq_u32_u8(vld1q_u8(src));
  // storeu_128: unaligned 16-byte store.
  vst1q_u8(dest, vreinterpretq_u8_u32(src));
  // add_128: lane-wise 32-bit addition.
  return vaddq_u32(a, b);
  // xor_128: lane-wise XOR.
  return veorq_u32(a, b);
  // set4: build a vector from four scalars staged through a stack array.
  return vld1q_u32(array);
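/* A minimal sketch of why the loads and stores above go through vld1q_u8/vst1q_u8:
 * the byte intrinsics have no alignment requirement, so deliberately misaligned
 * pointers round-trip correctly. roundtrip_check is an illustrative name, not part
 * of BLAKE3. */
#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

static void roundtrip_check(void) {
  uint8_t buf[17];
  for (int i = 0; i < 17; i++) buf[i] = (uint8_t)i;
  // Load 16 bytes from an address that is misaligned by one byte.
  uint32x4_t v = vreinterpretq_u32_u8(vld1q_u8(&buf[1]));
  uint8_t out[17] = {0};
  vst1q_u8(&out[1], vreinterpretq_u8_u32(v));
  for (int i = 1; i < 17; i++) assert(out[i] == buf[i]);
}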
  // rot16_128: rotate each 32-bit lane right by 16 by swapping its 16-bit halves.
  return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
  // rot12_128: shift left by 20, then shift-right-insert by 12 (a 32-bit right rotation).
  return vsriq_n_u32(vshlq_n_u32(x, 32 - 12), x, 12);
  // rot8_128: a byte shuffle where the compiler provides one, otherwise shift/insert.
#if defined(__clang__)
  return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12));
#elif defined(__GNUC__)
  static const uint8x16_t r8 = {1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
  return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
#else
  return vsriq_n_u32(vshlq_n_u32(x, 32 - 8), x, 8);
#endif
  // rot7_128: right rotation by 7 with the same shift/insert pattern.
  return vsriq_n_u32(vshlq_n_u32(x, 32 - 7), x, 7);
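/* A small sketch checking that the shift-left + VSRI pattern above really is a 32-bit
 * right rotation; rotr32_ref and check_rot12 are illustrative names, not part of BLAKE3. */
#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

static uint32_t rotr32_ref(uint32_t w, unsigned c) {
  return (w >> c) | (w << (32 - c));  // portable reference rotation
}

static void check_rot12(void) {
  const uint32_t in[4] = {0x01234567u, 0x89abcdefu, 0u, 0xffffffffu};
  uint32x4_t x = vld1q_u32(in);
  // Same pattern as rot12_128: the left shift supplies the high bits, and VSRI
  // inserts the right-shifted value into the low 32 - 12 = 20 bit positions.
  uint32x4_t r = vsriq_n_u32(vshlq_n_u32(x, 32 - 12), x, 12);
  uint32_t out[4];
  vst1q_u32(out, r);
  for (int i = 0; i < 4; i++) assert(out[i] == rotr32_ref(in[i], 12));
}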
  // transpose_vecs_128: in-place 4x4 transpose of four uint32x4_t rows.
  uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]);
  uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]);
  vecs[0] =
      vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0]));
  vecs[1] =
      vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1]));
  vecs[2] =
      vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0]));
  vecs[3] =
      vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1]));
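/* A standalone sketch demonstrating that the vtrnq/vcombine sequence above performs a
 * 4x4 transpose; transpose_demo is an illustrative name, not part of BLAKE3. */
#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

static void transpose_demo(void) {
  const uint32_t rows[4][4] = {
      {0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}};
  uint32x4_t vecs[4];
  for (int i = 0; i < 4; i++) vecs[i] = vld1q_u32(rows[i]);
  // The same steps as transpose_vecs_128 above.
  uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]);
  uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]);
  vecs[0] = vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0]));
  vecs[1] = vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1]));
  vecs[2] = vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0]));
  vecs[3] = vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1]));
  // Row 0 of the result is column 0 of the input.
  uint32_t col0[4];
  vst1q_u32(col0, vecs[0]);
  assert(col0[0] == 0 && col0[1] == 4 && col0[2] == 8 && col0[3] == 12);
}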
INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
                                size_t block_offset, uint32x4_t out[16]) {
  out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]);
  out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]);
  out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]);
  out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]);
  out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]);
  out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]);
  out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]);
  out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]);
  out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]);
  out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]);
  out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]);
  out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]);
  out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]);
  out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]);
  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]);
  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]);
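/* A sketch of the load pattern above in loop form: out[4*quarter + input] receives the
 * quarter-th 16 bytes of the current block from the input-th message. The helper names
 * here (loadu_128_sketch, load_msg_vecs_loop) are illustrative, not part of BLAKE3. */
#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

static inline uint32x4_t loadu_128_sketch(const uint8_t src[16]) {
  return vreinterpretq_u32_u8(vld1q_u8(src));  // unaligned-friendly 16-byte load
}

static void load_msg_vecs_loop(const uint8_t *const inputs[4], size_t block_offset,
                               uint32x4_t out[16]) {
  for (size_t quarter = 0; quarter < 4; quarter++) {
    for (size_t input = 0; input < 4; input++) {
      out[4 * quarter + input] = loadu_128_sketch(
          &inputs[input][block_offset + quarter * sizeof(uint32x4_t)]);
    }
  }
}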
INLINE void load_counters4(uint64_t counter, bool increment_counter,
                           uint32x4_t *out_low, uint32x4_t *out_high) {
static void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
                              const uint32_t key[8], uint64_t counter,
                              bool increment_counter, uint8_t flags,
                              uint8_t flags_start, uint8_t flags_end,
                              uint8_t *out) {
  // Broadcast each of the eight key words across a vector (one lane per input).
  uint32x4_t h_vecs[8] = {
      set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
      set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
  };
  uint32x4_t counter_low_vec, counter_high_vec;
  load_counters4(counter, increment_counter, &counter_low_vec, &counter_high_vec);
  uint8_t block_flags = flags | flags_start;
  // Inside the per-block loop: only the final block additionally sets flags_end.
  block_flags |= flags_end;
  uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
  uint32x4_t block_flags_vec = set1_128(block_flags);
  uint32x4_t msg_vecs[16];
  // The 4x4 compression state: v[0..7] = chaining value, v[8..11] = IV,
  // v[12..15] = counter low/high, block length, flags.
  uint32x4_t v[16] = {
      h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
      h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
      set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
      counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
  };
  // Feed forward: XOR the two halves of the state to form the new chaining values.
  h_vecs[0] = xor_128(v[0], v[8]);
  h_vecs[1] = xor_128(v[1], v[9]);
  h_vecs[2] = xor_128(v[2], v[10]);
  h_vecs[3] = xor_128(v[3], v[11]);
  h_vecs[4] = xor_128(v[4], v[12]);
  h_vecs[5] = xor_128(v[5], v[13]);
  h_vecs[6] = xor_128(v[6], v[14]);
  h_vecs[7] = xor_128(v[7], v[15]);
  // Transpose so that vector i holds the first 16 output bytes of input i and
  // vector 4 + i the second 16, then store each 32-byte output contiguously.
  transpose_vecs_128(&h_vecs[0]);
  transpose_vecs_128(&h_vecs[4]);
  storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]);
}
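/* A sketch of the resulting output layout: because of the interleaved store order above,
 * the 32-byte output of input n occupies bytes [32*n, 32*n + 32) of out. nth_output_sketch
 * is an illustrative name; 32 corresponds to BLAKE3_OUT_LEN in blake3_impl.h. */
#include <stddef.h>
#include <stdint.h>

static inline const uint8_t *nth_output_sketch(const uint8_t *out, size_t n) {
  return &out[n * 32];  // 32 bytes of output per hashed input
}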
  // In hash_one_neon: flags_start applies to the first block of the input,
  // and the final block additionally sets flags_end.
  uint8_t block_flags = flags | flags_start;
  block_flags |= flags_end;
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8], uint64_t counter,
                           bool increment_counter, uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out) {
  // Hash four inputs at a time with the vectorized path, then finish one at a time.
  while (num_inputs >= 4) {
    blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags,
                      flags_start, flags_end, out);
    if (increment_counter) {
      counter += 4;
    }
    // ... advance inputs, num_inputs, and out by four ...
  }
  while (num_inputs > 0) {
    hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start,
                  flags_end, out);
    if (increment_counter) {
      counter += 1;
    }
    // ... advance inputs, num_inputs, and out by one ...
  }
}
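/* A hedged usage sketch for blake3_hash_many_neon: hash four single-block chunks with
 * the standard (unkeyed) IV. It assumes the declarations from blake3_impl.h (IV,
 * CHUNK_START, CHUNK_END); demo_hash_four_chunks and the buffer names are illustrative. */
#include <stdint.h>

static void demo_hash_four_chunks(const uint8_t c0[64], const uint8_t c1[64],
                                  const uint8_t c2[64], const uint8_t c3[64],
                                  uint8_t out[4 * 32]) {
  const uint8_t *const inputs[4] = {c0, c1, c2, c3};
  // One 64-byte block per input; the chunk counter starts at 0 and increments per input.
  blake3_hash_many_neon(inputs, /*num_inputs=*/4, /*blocks=*/1, IV, /*counter=*/0,
                        /*increment_counter=*/true, /*flags=*/0,
                        /*flags_start=*/CHUNK_START, /*flags_end=*/CHUNK_END, out);
}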
// Related declarations: the x86 (__m128i) counterparts of the NEON helpers above,
// plus shared BLAKE3 definitions.
INLINE __m128i rot16_128(__m128i x)
INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
INLINE void storeu_128(__m128i src, uint8_t dest[16])
INLINE __m128i set1_128(uint32_t x)
INLINE void transpose_vecs_128(__m128i vecs[4])
INLINE __m128i rot8_128(__m128i x)
INLINE __m128i loadu_128(const uint8_t src[16])
INLINE __m128i rot12_128(__m128i x)
INLINE __m128i add_128(__m128i a, __m128i b)
INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, size_t block_offset, __m128i out[16])
INLINE void load_counters4(uint64_t counter, bool increment_counter, __m128i *out_lo, __m128i *out_hi)
INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r)
INLINE __m128i rot7_128(__m128i x)
INLINE __m128i xor_128(__m128i a, __m128i b)
static const uint8_t MSG_SCHEDULE[7][16]
static const uint32_t IV[8]
INLINE uint32_t counter_high(uint64_t counter)
INLINE uint32_t counter_low(uint64_t counter)
#define blake3_hash_many_neon
#define blake3_compress_in_place_portable