LLVM 16.0.0git
blake3_sse2.c
#include "blake3_impl.h"

#include <immintrin.h>

#define DEGREE 4

#define _mm_shuffle_ps2(a, b, c)                                         \
  (_mm_castps_si128(                                                     \
      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))

INLINE __m128i loadu(const uint8_t src[16]) {
  return _mm_loadu_si128((const __m128i *)src);
}

INLINE void storeu(__m128i src, uint8_t dest[16]) {
  _mm_storeu_si128((__m128i *)dest, src);
}

INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }

// Note that clang-format doesn't like the name "xor" for some reason.
INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }

INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }

INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
}

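// Descriptive note (added): rot16 rotates each 32-bit lane by 16 bits using
// 16-bit shuffles (the 0xB1 control swaps the two halves of every lane),
// avoiding the shift/shift/xor sequence used by the other rotations below.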
INLINE __m128i rot16(__m128i x) {
  return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1);
}

INLINE __m128i rot12(__m128i x) {
  return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
}

INLINE __m128i rot8(__m128i x) {
  return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8));
}

INLINE __m128i rot7(__m128i x) {
  return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
}

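// Descriptive note (added): g1 and g2 are the two halves of the BLAKE3 G
// function, applied to all four columns (or diagonals) at once. g1 performs
// the rot16/rot12 half and g2 the rot8/rot7 half, each mixing in one vector
// of four message words.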
INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {
  *row0 = addv(addv(*row0, m), *row1);
  *row3 = xorv(*row3, *row0);
  *row3 = rot16(*row3);
  *row2 = addv(*row2, *row3);
  *row1 = xorv(*row1, *row2);
  *row1 = rot12(*row1);
}

INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {
  *row0 = addv(addv(*row0, m), *row1);
  *row3 = xorv(*row3, *row0);
  *row3 = rot8(*row3);
  *row2 = addv(*row2, *row3);
  *row1 = xorv(*row1, *row2);
  *row1 = rot7(*row1);
}

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
}

INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
}

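// Descriptive note (added): SSE2-only equivalent of the SSE4.1
// _mm_blend_epi16 intrinsic. A per-16-bit-lane mask is built from the bits of
// imm8; lanes whose bit is set come from b, the rest from a.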
INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) {
  const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
  __m128i mask = _mm_set1_epi16(imm8);
  mask = _mm_and_si128(mask, bits);
  mask = _mm_cmpeq_epi16(mask, bits);
  return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}

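// Descriptive note (added): compress_pre loads the chaining value, IV words,
// counter, block length, and flags into the four row vectors and runs all
// seven rounds. The callers below finalize the state themselves, which lets
// the in-place and XOF variants share this body.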
INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags) {
  rows[0] = loadu((uint8_t *)&cv[0]);
  rows[1] = loadu((uint8_t *)&cv[4]);
  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
  rows[3] = set4(counter_low(counter), counter_high(counter),
                 (uint32_t)block_len, (uint32_t)flags);

  __m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
  __m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
  __m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
  __m128i m3 = loadu(&block[sizeof(__m128i) * 3]);

  __m128i t0, t1, t2, t3, tt;

  // Round 1. The first round permutes the message words from the original
  // input order, into the groups that get mixed in parallel.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 2. This round and all following rounds apply a fixed permutation
  // to the message words from the round before.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 3
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 4
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 5
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 6
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 7
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
}
259 
261  const uint8_t block[BLAKE3_BLOCK_LEN],
262  uint8_t block_len, uint64_t counter,
263  uint8_t flags) {
264  __m128i rows[4];
265  compress_pre(rows, cv, block, block_len, counter, flags);
266  storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]);
267  storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]);
268 }
269 
271  const uint8_t block[BLAKE3_BLOCK_LEN],
272  uint8_t block_len, uint64_t counter,
273  uint8_t flags, uint8_t out[64]) {
274  __m128i rows[4];
275  compress_pre(rows, cv, block, block_len, counter, flags);
276  storeu(xorv(rows[0], rows[2]), &out[0]);
277  storeu(xorv(rows[1], rows[3]), &out[16]);
278  storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]);
279  storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]);
280 }
281 
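// Descriptive note (added): one full round of the 4-way compression on
// transposed state, where each __m128i holds the same state word from four
// independent inputs. The first half is the column step and the second half
// the diagonal step, with message words selected by MSG_SCHEDULE for round r.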
INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = addv(v[0], v[4]);
  v[1] = addv(v[1], v[5]);
  v[2] = addv(v[2], v[6]);
  v[3] = addv(v[3], v[7]);
  v[12] = xorv(v[12], v[0]);
  v[13] = xorv(v[13], v[1]);
  v[14] = xorv(v[14], v[2]);
  v[15] = xorv(v[15], v[3]);
  v[12] = rot16(v[12]);
  v[13] = rot16(v[13]);
  v[14] = rot16(v[14]);
  v[15] = rot16(v[15]);
  v[8] = addv(v[8], v[12]);
  v[9] = addv(v[9], v[13]);
  v[10] = addv(v[10], v[14]);
  v[11] = addv(v[11], v[15]);
  v[4] = xorv(v[4], v[8]);
  v[5] = xorv(v[5], v[9]);
  v[6] = xorv(v[6], v[10]);
  v[7] = xorv(v[7], v[11]);
  v[4] = rot12(v[4]);
  v[5] = rot12(v[5]);
  v[6] = rot12(v[6]);
  v[7] = rot12(v[7]);
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = addv(v[0], v[4]);
  v[1] = addv(v[1], v[5]);
  v[2] = addv(v[2], v[6]);
  v[3] = addv(v[3], v[7]);
  v[12] = xorv(v[12], v[0]);
  v[13] = xorv(v[13], v[1]);
  v[14] = xorv(v[14], v[2]);
  v[15] = xorv(v[15], v[3]);
  v[12] = rot8(v[12]);
  v[13] = rot8(v[13]);
  v[14] = rot8(v[14]);
  v[15] = rot8(v[15]);
  v[8] = addv(v[8], v[12]);
  v[9] = addv(v[9], v[13]);
  v[10] = addv(v[10], v[14]);
  v[11] = addv(v[11], v[15]);
  v[4] = xorv(v[4], v[8]);
  v[5] = xorv(v[5], v[9]);
  v[6] = xorv(v[6], v[10]);
  v[7] = xorv(v[7], v[11]);
  v[4] = rot7(v[4]);
  v[5] = rot7(v[5]);
  v[6] = rot7(v[6]);
  v[7] = rot7(v[7]);

  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = addv(v[0], v[5]);
  v[1] = addv(v[1], v[6]);
  v[2] = addv(v[2], v[7]);
  v[3] = addv(v[3], v[4]);
  v[15] = xorv(v[15], v[0]);
  v[12] = xorv(v[12], v[1]);
  v[13] = xorv(v[13], v[2]);
  v[14] = xorv(v[14], v[3]);
  v[15] = rot16(v[15]);
  v[12] = rot16(v[12]);
  v[13] = rot16(v[13]);
  v[14] = rot16(v[14]);
  v[10] = addv(v[10], v[15]);
  v[11] = addv(v[11], v[12]);
  v[8] = addv(v[8], v[13]);
  v[9] = addv(v[9], v[14]);
  v[5] = xorv(v[5], v[10]);
  v[6] = xorv(v[6], v[11]);
  v[7] = xorv(v[7], v[8]);
  v[4] = xorv(v[4], v[9]);
  v[5] = rot12(v[5]);
  v[6] = rot12(v[6]);
  v[7] = rot12(v[7]);
  v[4] = rot12(v[4]);
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = addv(v[0], v[5]);
  v[1] = addv(v[1], v[6]);
  v[2] = addv(v[2], v[7]);
  v[3] = addv(v[3], v[4]);
  v[15] = xorv(v[15], v[0]);
  v[12] = xorv(v[12], v[1]);
  v[13] = xorv(v[13], v[2]);
  v[14] = xorv(v[14], v[3]);
  v[15] = rot8(v[15]);
  v[12] = rot8(v[12]);
  v[13] = rot8(v[13]);
  v[14] = rot8(v[14]);
  v[10] = addv(v[10], v[15]);
  v[11] = addv(v[11], v[12]);
  v[8] = addv(v[8], v[13]);
  v[9] = addv(v[9], v[14]);
  v[5] = xorv(v[5], v[10]);
  v[6] = xorv(v[6], v[11]);
  v[7] = xorv(v[7], v[8]);
  v[4] = xorv(v[4], v[9]);
  v[5] = rot7(v[5]);
  v[6] = rot7(v[6]);
  v[7] = rot7(v[7]);
  v[4] = rot7(v[4]);
}

INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
  // 22/33. Note that this doesn't split the vector into two lanes, as the
  // AVX2 counterparts do.
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);

  // Interleave 64-bit lanes.
  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);

  vecs[0] = abcd_0;
  vecs[1] = abcd_1;
  vecs[2] = abcd_2;
  vecs[3] = abcd_3;
}

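// Descriptive note (added): load one 64-byte block from each of the four
// inputs and transpose the sixteen vectors so that out[i] holds message word
// i from all four blocks. The prefetches hint at the data 256 bytes past the
// current block offset in each input.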
INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
                               size_t block_offset, __m128i out[16]) {
  out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
  out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
  out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
  out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
  out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
  out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
  out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
  out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
  out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
  out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
  out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
  out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
  out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
  out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
  out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
  out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
  for (size_t i = 0; i < 4; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  transpose_vecs(&out[0]);
  transpose_vecs(&out[4]);
  transpose_vecs(&out[8]);
  transpose_vecs(&out[12]);
}

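// Descriptive note (added): build the per-lane low/high counter words. When
// increment_counter is set, the four lanes get counter+0..3; the carry into
// the high word is detected with a signed comparison after biasing both
// operands by 0x80000000, which emulates the unsigned comparison SSE2 lacks.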
INLINE void load_counters(uint64_t counter, bool increment_counter,
                          __m128i *out_lo, __m128i *out_hi) {
  const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
  const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
  const __m128i add1 = _mm_and_si128(mask, add0);
  __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
  __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
                                  _mm_xor_si128(l, _mm_set1_epi32(0x80000000)));
  __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
  *out_lo = l;
  *out_hi = h;
}

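// Descriptive note (added): hash four equal-length inputs in parallel, one
// 64-byte block per loop iteration, and write the four 32-byte chaining
// values contiguously to out.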
static
void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks,
                       const uint32_t key[8], uint64_t counter,
                       bool increment_counter, uint8_t flags,
                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
  __m128i h_vecs[8] = {
      set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
      set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
  };
  __m128i counter_low_vec, counter_high_vec;
  load_counters(counter, increment_counter, &counter_low_vec,
                &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
    __m128i block_flags_vec = set1(block_flags);
    __m128i msg_vecs[16];
    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    __m128i v[16] = {
        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
        set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    };
    round_fn(v, msg_vecs, 0);
    round_fn(v, msg_vecs, 1);
    round_fn(v, msg_vecs, 2);
    round_fn(v, msg_vecs, 3);
    round_fn(v, msg_vecs, 4);
    round_fn(v, msg_vecs, 5);
    round_fn(v, msg_vecs, 6);
    h_vecs[0] = xorv(v[0], v[8]);
    h_vecs[1] = xorv(v[1], v[9]);
    h_vecs[2] = xorv(v[2], v[10]);
    h_vecs[3] = xorv(v[3], v[11]);
    h_vecs[4] = xorv(v[4], v[12]);
    h_vecs[5] = xorv(v[5], v[13]);
    h_vecs[6] = xorv(v[6], v[14]);
    h_vecs[7] = xorv(v[7], v[15]);

    block_flags = flags;
  }

  transpose_vecs(&h_vecs[0]);
  transpose_vecs(&h_vecs[4]);
  // The first four vecs now contain the first half of each output, and the
  // second four vecs contain the second half of each output.
  storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
  storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
  storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
  storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
  storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
  storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
  storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
  storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
}

INLINE void hash_one_sse2(const uint8_t *input, size_t blocks,
                          const uint32_t key[8], uint64_t counter,
                          uint8_t flags, uint8_t flags_start,
                          uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
  uint32_t cv[8];
  memcpy(cv, key, BLAKE3_KEY_LEN);
  uint8_t block_flags = flags | flags_start;
  while (blocks > 0) {
    if (blocks == 1) {
      block_flags |= flags_end;
    }
    blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter,
                                  block_flags);
    input = &input[BLAKE3_BLOCK_LEN];
    blocks -= 1;
    block_flags = flags;
  }
  memcpy(out, cv, BLAKE3_OUT_LEN);
}

void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out) {
  while (num_inputs >= DEGREE) {
    blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags,
                      flags_start, flags_end, out);
    if (increment_counter) {
      counter += DEGREE;
    }
    inputs += DEGREE;
    num_inputs -= DEGREE;
    out = &out[DEGREE * BLAKE3_OUT_LEN];
  }
  while (num_inputs > 0) {
    hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start,
                  flags_end, out);
    if (increment_counter) {
      counter += 1;
    }
    inputs += 1;
    num_inputs -= 1;
    out = &out[BLAKE3_OUT_LEN];
  }
}
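
// Usage sketch (added, not from the original file; the real callers are
// BLAKE3's generic dispatch layer). Hashing four complete 1024-byte chunks in
// parallel with this kernel might look roughly like the snippet below. The
// names `chunk0`..`chunk3` and `key_words` are hypothetical; CHUNK_START and
// CHUNK_END are the chunk flag constants declared in blake3_impl.h, and a
// 1024-byte chunk is 16 blocks of BLAKE3_BLOCK_LEN (64) bytes.
//
//   const uint8_t *inputs[4] = {chunk0, chunk1, chunk2, chunk3};
//   uint8_t out[4 * BLAKE3_OUT_LEN];
//   blake3_hash_many_sse2(inputs, /*num_inputs=*/4, /*blocks=*/16, key_words,
//                         /*counter=*/0, /*increment_counter=*/true,
//                         /*flags=*/0, /*flags_start=*/CHUNK_START,
//                         /*flags_end=*/CHUNK_END, out);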