8#error "This implementation only supports little-endian ARM."
  memcpy(dest, &src, 16); /* unaligned 16-byte store via memcpy */
  return vaddq_u32(a, b); /* lane-wise 32-bit add */
  return veorq_u32(a, b); /* lane-wise XOR */
  return vld1q_u32(array); /* build a vector from the four words packed into `array` */
  return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16)); /* rotate each lane right by 16 */
  return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12)); /* rotate each lane right by 12 */
  return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));   /* rotate each lane right by 8 */
  return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));   /* rotate each lane right by 7 */
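/* Aside (not part of the original excerpt): the shift-then-OR rotates above are
   the portable NEON formulation. A common alternative is a single
   shift-right-and-insert, sketched below for the 12-bit case; which form is
   faster depends on the microarchitecture. The helper name is illustrative. */
static inline uint32x4_t rot12_vsri_sketch(uint32x4_t x) {
  /* vshlq_n_u32(x, 20) puts the wrapped-around low 12 bits of each lane in the
     top bit positions; VSRI then shifts x right by 12 and inserts it beneath
     them, producing a 12-bit right rotation per lane. */
  return vsriq_n_u32(vshlq_n_u32(x, 32 - 12), x, 12);
}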
INLINE void transpose_vecs_128(uint32x4_t vecs[4]) {
  uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]);
  uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]);
  vecs[0] =
      vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0]));
  vecs[1] =
      vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1]));
  vecs[2] =
      vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0]));
  vecs[3] =
      vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1]));
}
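/* Lane bookkeeping for the transpose above: vtrnq_u32(a, b) yields
   val[0] = { a0, b0, a2, b2 } and val[1] = { a1, b1, a3, b3 }.
   Combining the low 64-bit halves of rows01.val[0] and rows23.val[0] therefore
   gathers word 0 of all four rows into one vector, and the high halves gather
   word 2, which is exactly the column layout the multi-input hash needs. */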
/* Gather one 64-byte block from each of the four inputs as sixteen 16-byte
   vectors: out[4*i + j] holds bytes [16*i, 16*i + 16) of input j's block. */
INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
                                size_t block_offset, uint32x4_t out[16]) {
  out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]);
  out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]);
  out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]);
  out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]);
  out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]);
  out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]);
  out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]);
  out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]);
  out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]);
  out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]);
  out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]);
  out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]);
  out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]);
  out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]);
  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]);
  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]);
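  /* A sketch of the usual continuation (not verbatim from this excerpt),
     assuming the 4x4 transpose helper above: transposing each group of four
     vectors leaves out[w] holding 32-bit message word w from all four inputs,
     one per lane, ready for the vectorized rounds. */
  transpose_vecs_128(&out[0]);
  transpose_vecs_128(&out[4]);
  transpose_vecs_128(&out[8]);
  transpose_vecs_128(&out[12]);
}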
INLINE void load_counters4(uint64_t counter, bool increment_counter,
                           uint32x4_t *out_low, uint32x4_t *out_high) {
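  /* Body sketch (not verbatim from this excerpt), assuming the set4 helper
     above and the counter_low/counter_high helpers from blake3_impl.h: each
     lane gets the 64-bit counter for its input, incremented per lane only when
     increment_counter is set, then split into low and high 32-bit halves. */
  uint64_t mask = increment_counter ? ~(uint64_t)0 : 0;
  *out_low = set4(
      counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
      counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
  *out_high = set4(
      counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
      counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
}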
void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
                       const uint32_t key[8], uint64_t counter,
                       bool increment_counter, uint8_t flags,
                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
  uint32x4_t h_vecs[8] = {
      set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
      set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
  };
  uint32x4_t counter_low_vec, counter_high_vec;
  load_counters4(counter, increment_counter, &counter_low_vec,
                 &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
    uint32x4_t block_flags_vec = set1_128(block_flags);
    uint32x4_t msg_vecs[16];
    transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    uint32x4_t v[16] = {
        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
        set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    };
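    /* The seven compression rounds; a sketch assuming the per-round helper is
       named round_fn4, matching the four-lane helpers above: */
    round_fn4(v, msg_vecs, 0);
    round_fn4(v, msg_vecs, 1);
    round_fn4(v, msg_vecs, 2);
    round_fn4(v, msg_vecs, 3);
    round_fn4(v, msg_vecs, 4);
    round_fn4(v, msg_vecs, 5);
    round_fn4(v, msg_vecs, 6);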
    /* Feed-forward: the new chaining value for each input is the XOR of the
       two halves of the compression state. */
    h_vecs[0] = xor_128(v[0], v[8]);
    h_vecs[1] = xor_128(v[1], v[9]);
    h_vecs[2] = xor_128(v[2], v[10]);
    h_vecs[3] = xor_128(v[3], v[11]);
    h_vecs[4] = xor_128(v[4], v[12]);
    h_vecs[5] = xor_128(v[5], v[13]);
    h_vecs[6] = xor_128(v[6], v[14]);
    h_vecs[7] = xor_128(v[7], v[15]);
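    block_flags = flags; /* flags_start applies only to the first block */
  }

  /* Sketch of the usual continuation (not verbatim from this excerpt): the
     chaining values are still word-sliced across lanes at this point, so the
     two groups of four vectors are transposed back before being written out.
     After that, h_vecs[0..3] hold the first 16 bytes of each of the four
     outputs and h_vecs[4..7] hold the second 16 bytes, which is why the stores
     below interleave them. */
  transpose_vecs_128(&h_vecs[0]);
  transpose_vecs_128(&h_vecs[4]);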
  storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]);
}
  /* From hash_one_neon, the serial single-input path (see the sketch below):
     flags_start is OR'd in for the first block, flags_end for the last. */
  uint8_t block_flags = flags | flags_start;
      block_flags |= flags_end;
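/* A minimal sketch of such a serial helper (not verbatim from this file),
   assuming the portable fallback blake3_compress_in_place_portable and the
   BLAKE3_BLOCK_LEN / BLAKE3_KEY_LEN / BLAKE3_OUT_LEN constants from
   blake3_impl.h: */
static inline void hash_one_neon_sketch(const uint8_t *input, size_t blocks,
                                        const uint32_t key[8], uint64_t counter,
                                        uint8_t flags, uint8_t flags_start,
                                        uint8_t flags_end,
                                        uint8_t out[BLAKE3_OUT_LEN]) {
  uint32_t cv[8];
  memcpy(cv, key, BLAKE3_KEY_LEN); /* the chaining value starts as the key */
  uint8_t block_flags = flags | flags_start;
  while (blocks > 0) {
    if (blocks == 1) {
      block_flags |= flags_end; /* last block of this input */
    }
    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
                                      block_flags);
    input = &input[BLAKE3_BLOCK_LEN];
    blocks -= 1;
    block_flags = flags; /* flags_start only applies to the first block */
  }
  memcpy(out, cv, BLAKE3_OUT_LEN);
}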
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out) {
  while (num_inputs >= 4) {
    blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags,
                      flags_start, flags_end, out);
    if (increment_counter) {
      counter += 4;
    }
    inputs += 4;
    num_inputs -= 4;
    out = &out[4 * BLAKE3_OUT_LEN];
  }
  while (num_inputs > 0) {
    hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start,
                  flags_end, out);
    if (increment_counter) {
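      counter += 1;
    }
    inputs += 1;
    num_inputs -= 1;
    out = &out[BLAKE3_OUT_LEN];
  }
}

/* Hedged usage sketch (not from this file): compressing four independent
   single-block (at most 64-byte) chunks to their 32-byte chaining values, with
   consecutive chunk counters. CHUNK_START, CHUNK_END, and BLAKE3_OUT_LEN come
   from blake3_impl.h; real callers normally reach this function through the
   blake3_dispatch layer rather than calling it directly. */
static void hash_many_neon_example(const uint32_t key[8],
                                   const uint8_t *chunks[4],
                                   uint8_t out[4 * BLAKE3_OUT_LEN]) {
  blake3_hash_many_neon(chunks, /*num_inputs=*/4, /*blocks=*/1, key,
                        /*counter=*/0, /*increment_counter=*/true,
                        /*flags=*/0, CHUNK_START, CHUNK_END, out);
}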