#define _mm_shuffle_ps2(a, b, c)                                               \
  (_mm_castps_si128(                                                           \
      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
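/* The macro above shuffles 32-bit lanes from two integer vectors via the
   float shuffle unit; the ps/si128 casts only reinterpret the bit pattern and
   generate no instructions. */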
  return _mm_loadu_si128((const __m128i *)src);

  return _mm256_loadu_si256((const __m256i *)src);

  return _mm512_loadu_si512((const __m512i *)src);
  _mm_storeu_si128((__m128i *)dest, src);

  _mm256_storeu_si256((__m256i *)dest, src);
INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }

INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }

INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); }

INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }

INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }

INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); }
  return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
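/* g1 and g2 below are the two halves of the BLAKE3 G function, applied to all
   four columns of the state at once (or to the four diagonals once the rows
   have been diagonalized), mixing in one vector of message words per half. */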
INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {

INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));

  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
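/* The shuffles above are diagonalize/undiagonalize: rows 0, 2, and 3 are
   rotated so the state's diagonals line up as columns for the second pair of
   g1/g2 calls in each round, then rotated back afterwards. */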
                         uint8_t block_len, uint64_t counter, uint8_t flags) {
  __m128i t0, t1, t2, t3, tt;
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
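  /* Each block below is one further round. Instead of indexing MSG_SCHEDULE at
     runtime, the message vectors t0..t3 from the previous round are permuted
     in place with shuffles, blends, and unpacks before being fed to g1/g2. */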
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);

  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);

  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);

  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);

  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);

  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
                               uint8_t block_len, uint64_t counter,
                               uint8_t flags, uint8_t out[64]) {
                                    uint8_t block_len, uint64_t counter,
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);

  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
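/* A 4x4 transpose in two stages: interleave 32-bit lanes of adjacent vectors,
   then interleave 64-bit lanes of those results, so that abcd_i holds word i
   of all four original vectors. */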
                                size_t block_offset, __m128i out[16]) {
  out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
  out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
  out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
  out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
  out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
  out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
  out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
  out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
  out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
  out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
  out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
  out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
  out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
  out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
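  /* Prefetch well ahead (256 bytes past the current block) in each input so
     the data for upcoming blocks is already in cache. */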
  for (size_t i = 0; i < 4; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
                           __m128i *out_lo, __m128i *out_hi) {
  uint64_t mask = (increment_counter ? ~0 : 0);
  __m256i mask_vec = _mm256_set1_epi64x(mask);
  __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3);
  deltas = _mm256_and_si256(mask_vec, deltas);
  __m256i counters =
      _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas);
  *out_lo = _mm256_cvtepi64_epi32(counters);
  *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
}
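/* load_counters4 builds the per-input counters (counter + 0..3, or four copies
   of counter when increment_counter is false) as 64-bit lanes, then splits
   them into the 32-bit low and high halves that the transposed state uses. */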
                                bool increment_counter, uint8_t flags,
                                uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
  __m128i h_vecs[8] = {
  __m128i counter_low_vec, counter_high_vec;

    uint8_t block_flags = flags | flags_start;
      block_flags |= flags_end;
    __m128i block_flags_vec = set1_128(block_flags);
    __m128i msg_vecs[16];

        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
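    /* After the seven rounds (elided here), the feed-forward XOR of the two
       halves of the state yields the next chaining value for each input. */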
    h_vecs[0] = xor_128(v[0], v[8]);
    h_vecs[1] = xor_128(v[1], v[9]);
    h_vecs[2] = xor_128(v[2], v[10]);
    h_vecs[3] = xor_128(v[3], v[11]);
    h_vecs[4] = xor_128(v[4], v[12]);
    h_vecs[5] = xor_128(v[5], v[13]);
    h_vecs[6] = xor_128(v[6], v[14]);
    h_vecs[7] = xor_128(v[7], v[15]);
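  /* After transposing h_vecs in two groups of four, vectors 0..3 hold the
     first 16 bytes of each output chaining value and vectors 4..7 hold the
     second 16 bytes, hence the interleaved store order below. */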
  storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]);
  storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]);
  storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]);
  storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]);
  storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]);
  storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]);
  storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]);
  storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
  __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
  __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
  __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
  __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
  __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
  __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);

  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
  __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
  __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
  __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
  __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
  __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
  __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);

  vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
  vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
  vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
  vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
  vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
  vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
  vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
  vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
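/* transpose_vecs_256 is the same idea in three stages: interleave 32-bit
   lanes, interleave 64-bit lanes, then exchange 128-bit halves with
   _mm256_permute2x128_si256 to complete the 8x8 transpose. */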
                                size_t block_offset, __m256i out[16]) {
  out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
  out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
  out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
  out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
  out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
  out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
  out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
  out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
  out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
  out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
  out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
  out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
  out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
  out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
  out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
  out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
  for (size_t i = 0; i < 8; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
                           __m256i *out_lo, __m256i *out_hi) {
  uint64_t mask = (increment_counter ? ~0 : 0);
  __m512i mask_vec = _mm512_set1_epi64(mask);
  __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
  deltas = _mm512_and_si512(mask_vec, deltas);
  __m512i counters =
      _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas);
  *out_lo = _mm512_cvtepi64_epi32(counters);
  *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
}
                                bool increment_counter, uint8_t flags,
                                uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
  __m256i h_vecs[8] = {
  __m256i counter_low_vec, counter_high_vec;

    uint8_t block_flags = flags | flags_start;
      block_flags |= flags_end;
    __m256i block_flags_vec = set1_256(block_flags);
    __m256i msg_vecs[16];

        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    h_vecs[0] = xor_256(v[0], v[8]);
    h_vecs[1] = xor_256(v[1], v[9]);
    h_vecs[2] = xor_256(v[2], v[10]);
    h_vecs[3] = xor_256(v[3], v[11]);
    h_vecs[4] = xor_256(v[4], v[12]);
    h_vecs[5] = xor_256(v[5], v[13]);
    h_vecs[6] = xor_256(v[6], v[14]);
    h_vecs[7] = xor_256(v[7], v[15]);
  storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]);
  storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]);
  storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]);
  storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]);
  storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]);
  storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]);
  storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]);
  storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]);
  return _mm512_shuffle_i32x4(a, b, LO_IMM8);

  return _mm512_shuffle_i32x4(a, b, HI_IMM8);
  __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
  __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
  __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
  __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
  __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
  __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
  __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
  __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
  __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
  __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
  __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
  __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
  __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
  __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
  __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
  __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);

  __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
  __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
  __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
  __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
  __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
  __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
  __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
  __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
  __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
  __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
  __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
  __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
  __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
  __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
  __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
  __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);
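/* The 16x16 transpose continues the same pattern one level up: after the
   32-bit and 64-bit interleaves above, 128-bit lane shuffles (unpack_lo_128 /
   unpack_hi_128, built on _mm512_shuffle_i32x4) gather each word index into
   its own vector. */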
                                 size_t block_offset, __m512i out[16]) {
  out[0] = loadu_512(&inputs[0][block_offset]);
  out[1] = loadu_512(&inputs[1][block_offset]);
  out[2] = loadu_512(&inputs[2][block_offset]);
  out[3] = loadu_512(&inputs[3][block_offset]);
  out[4] = loadu_512(&inputs[4][block_offset]);
  out[5] = loadu_512(&inputs[5][block_offset]);
  out[6] = loadu_512(&inputs[6][block_offset]);
  out[7] = loadu_512(&inputs[7][block_offset]);
  out[8] = loadu_512(&inputs[8][block_offset]);
  out[9] = loadu_512(&inputs[9][block_offset]);
  out[10] = loadu_512(&inputs[10][block_offset]);
  out[11] = loadu_512(&inputs[11][block_offset]);
  out[12] = loadu_512(&inputs[12][block_offset]);
  out[13] = loadu_512(&inputs[13][block_offset]);
  out[14] = loadu_512(&inputs[14][block_offset]);
  out[15] = loadu_512(&inputs[15][block_offset]);
  for (size_t i = 0; i < 16; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
                            __m512i *out_lo, __m512i *out_hi) {
  const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
  const __m512i add0 =
      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  const __m512i add1 = _mm512_and_si512(mask, add0);
  __m512i l = _mm512_add_epi32(_mm512_set1_epi32((int32_t)counter), add1);
  __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT);
  __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32((int32_t)(counter >> 32)),
                                    carry,
                                    _mm512_set1_epi32((int32_t)(counter >> 32)),
                                    _mm512_set1_epi32(1));
  *out_lo = l;
  *out_hi = h;
}
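/* Unlike load_counters4/8, sixteen 64-bit counters do not fit in a single
   vector, so the low words are produced with a 32-bit add and the carry into
   the high words is detected explicitly: the addition wrapped exactly when
   the result is (unsigned) less than the addend. */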
                                 bool increment_counter, uint8_t flags,
                                 uint8_t flags_start, uint8_t flags_end,
  __m512i h_vecs[8] = {
  __m512i counter_low_vec, counter_high_vec;

    uint8_t block_flags = flags | flags_start;
      block_flags |= flags_end;
    __m512i block_flags_vec = set1_512(block_flags);
    __m512i msg_vecs[16];

        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    h_vecs[0] = xor_512(v[0], v[8]);
    h_vecs[1] = xor_512(v[1], v[9]);
    h_vecs[2] = xor_512(v[2], v[10]);
    h_vecs[3] = xor_512(v[3], v[11]);
    h_vecs[4] = xor_512(v[4], v[12]);
    h_vecs[5] = xor_512(v[5], v[13]);
    h_vecs[6] = xor_512(v[6], v[14]);
    h_vecs[7] = xor_512(v[7], v[15]);

    block_flags = flags;
  __m512i padded[16] = {
      h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
      h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
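  /* The low 256 bits of each padded vector hold one transposed 32-byte output
     chaining value; with an all-ones mask, each masked store below writes
     exactly those 32 bytes (equivalent to an unaligned 256-bit store). */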
  _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[0]));
  _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[1]));
  _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[2]));
  _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[3]));
  _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[4]));
  _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[5]));
  _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[6]));
  _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[7]));
  _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[8]));
  _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[9]));
  _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[10]));
  _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[11]));
  _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[12]));
  _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[13]));
  _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[14]));
  _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[15]));
                            uint8_t flags, uint8_t flags_start,
  uint8_t block_flags = flags | flags_start;
      block_flags |= flags_end;
    block_flags = flags;
                             uint64_t counter, bool increment_counter,
                             uint8_t flags, uint8_t flags_start,
                             uint8_t flags_end, uint8_t *out) {
  while (num_inputs >= 16) {
    blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags,
                         flags_start, flags_end, out);
    if (increment_counter) {
  while (num_inputs >= 8) {
    blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags,
                        flags_start, flags_end, out);
    if (increment_counter) {
  while (num_inputs >= 4) {
    blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags,
                        flags_start, flags_end, out);
    if (increment_counter) {
  while (num_inputs > 0) {
    if (increment_counter) {
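/* Illustrative sketch only, not part of this file: one way a caller might
   drive blake3_hash_many_avx512 over many independent single-block chunks.
   It assumes the CHUNK_START/CHUNK_END flag constants and BLAKE3_OUT_LEN from
   blake3_impl.h; the helper name and the one-block-per-chunk layout are
   assumptions made for the example. Each input is the lone 64-byte block of
   its chunk, so it carries both the start and end flags, and the chunk
   counter increments across inputs. */
static void example_hash_single_block_chunks(const uint8_t *const *inputs,
                                             size_t num_inputs,
                                             const uint32_t key[8],
                                             uint8_t *out) {
  /* Writes num_inputs * BLAKE3_OUT_LEN bytes of chaining values to out. */
  blake3_hash_many_avx512(inputs, num_inputs,
                          /*blocks=*/1, key,
                          /*counter=*/0, /*increment_counter=*/true,
                          /*flags=*/0,
                          /*flags_start=*/CHUNK_START,
                          /*flags_end=*/CHUNK_END, out);
}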
INLINE __m128i rot16_128(__m128i x)
INLINE __m512i rot8_512(__m512i x)
INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
#define _mm_shuffle_ps2(a, b, c)
INLINE __m256i set1_256(uint32_t x)
INLINE void storeu_128(__m128i src, uint8_t dest[16])
INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
INLINE __m128i set1_128(uint32_t x)
INLINE __m512i set1_512(uint32_t x)
INLINE __m256i rot8_256(__m256i x)
INLINE __m512i loadu_512(const uint8_t src[64])
INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r)
INLINE __m256i rot7_256(__m256i x)
INLINE void storeu_256(__m256i src, uint8_t dest[16])
INLINE __m512i rot16_512(__m512i x)
INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN])
INLINE void transpose_vecs_128(__m128i vecs[4])
INLINE __m128i rot8_128(__m128i x)
INLINE void transpose_vecs_512(__m512i vecs[16])
INLINE __m512i add_512(__m512i a, __m512i b)
INLINE __m256i xor_256(__m256i a, __m256i b)
INLINE __m128i loadu_128(const uint8_t src[16])
INLINE __m128i rot12_128(__m128i x)
INLINE __m256i rot12_256(__m256i x)
INLINE void load_counters8(uint64_t counter, bool increment_counter, __m256i *out_lo, __m256i *out_hi)
INLINE __m128i add_128(__m128i a, __m128i b)
static void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3)
INLINE void load_counters16(uint64_t counter, bool increment_counter, __m512i *out_lo, __m512i *out_hi)
INLINE __m256i rot16_256(__m256i x)
INLINE __m512i unpack_lo_128(__m512i a, __m512i b)
INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, size_t block_offset, __m128i out[16])
INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, size_t block_offset, __m256i out[16])
INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3)
INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
INLINE void load_counters4(uint64_t counter, bool increment_counter, __m128i *out_lo, __m128i *out_hi)
INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
static void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
INLINE __m256i loadu_256(const uint8_t src[32])
INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r)
INLINE __m512i rot7_512(__m512i x)
INLINE void transpose_vecs_256(__m256i vecs[8])
INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, size_t block_offset, __m512i out[16])
INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r)
INLINE __m512i unpack_hi_128(__m512i a, __m512i b)
INLINE __m256i add_256(__m256i a, __m256i b)
INLINE __m128i rot7_128(__m128i x)
INLINE __m128i xor_128(__m128i a, __m128i b)
INLINE __m512i xor_512(__m512i a, __m512i b)
INLINE __m512i rot12_512(__m512i x)
static void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
static const uint8_t MSG_SCHEDULE[7][16]
static const uint32_t IV[8]
INLINE uint32_t counter_high(uint64_t counter)
INLINE uint32_t counter_low(uint64_t counter)
#define blake3_hash_many_avx512
#define blake3_compress_xof_avx512
#define blake3_compress_in_place_avx512