LLVM  16.0.0git
blake3_dispatch.c
Go to the documentation of this file.
1 #include <stdbool.h>
2 #include <stddef.h>
3 #include <stdint.h>
4 
5 #include "blake3_impl.h"
6 
7 #if defined(IS_X86)
8 #if defined(_MSC_VER)
9 #include <intrin.h>
10 #elif defined(__GNUC__)
11 #include <immintrin.h>
12 #else
13 #error "Unimplemented!"
14 #endif
15 #endif
16 
17 #define MAYBE_UNUSED(x) (void)((x))
18 
19 #if defined(IS_X86)
/*
 * Execute XGETBV with ECX = 0 and return the result as a single 64-bit
 * value: EDX provides the upper 32 bits and EAX the lower 32 bits.
 */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t low_half = 0;
  uint32_t high_half = 0;
  __asm__ __volatile__("xgetbv\n" : "=a"(low_half), "=d"(high_half) : "c"(0));
  const uint64_t shifted_high = (uint64_t)high_half << 32;
  return shifted_high | low_half;
#endif
}
29 
/*
 * Run the CPUID instruction for leaf `id` and store the resulting
 * EAX, EBX, ECX and EDX values in out[0], out[1], out[2] and out[3].
 */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#else
  uint32_t reg_a = 0;
  uint32_t reg_b = 0;
  uint32_t reg_c = 0;
  uint32_t reg_d = 0;
#if defined(__i386__) || defined(_M_IX86)
  /*
   * 32-bit variant: EBX is copied into a scratch register before CPUID and
   * swapped back afterwards, so the scratch register ends up holding CPUID's
   * EBX result while EBX itself keeps its previous value (presumably because
   * EBX may be reserved, e.g. as the PIC base register).
   */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(reg_a), "=r"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(reg_a), "=b"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id));
#endif
  out[0] = reg_a;
  out[1] = reg_b;
  out[2] = reg_c;
  out[3] = reg_d;
#endif
}
45 
/*
 * Run the CPUID instruction for leaf `id` with sub-leaf `sid` (passed in ECX)
 * and store the resulting EAX, EBX, ECX and EDX values in out[0..3].
 */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#else
  uint32_t reg_a = 0;
  uint32_t reg_b = 0;
  uint32_t reg_c = 0;
  uint32_t reg_d = 0;
#if defined(__i386__) || defined(_M_IX86)
  /*
   * 32-bit variant: EBX is parked in a scratch register across CPUID and
   * swapped back afterwards, leaving CPUID's EBX result in the scratch
   * register and the previous EBX value restored.
   */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(reg_a), "=r"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(reg_a), "=b"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id), "c"(sid));
#endif
  out[0] = reg_a;
  out[1] = reg_b;
  out[2] = reg_c;
  out[3] = reg_d;
#endif
}
61 
62 #endif
63 
/*
 * Bit flags describing the SIMD instruction sets this file dispatches on.
 * Each feature occupies its own bit so that a set of features can be stored
 * in one value and tested with `&`. UNDEFINED is a separate marker bit, not
 * a feature; get_cpu_features() compares the cached value against it.
 */
enum cpu_feature {
  SSE2 = 1 << 0,
  SSSE3 = 1 << 1,
  SSE41 = 1 << 2,
  AVX = 1 << 3,
  AVX2 = 1 << 4,
  AVX512F = 1 << 5,
  AVX512VL = 1 << 6,
  /* ... */
  UNDEFINED = 1 << 30
};
75 
76 #if !defined(BLAKE3_TESTING)
77 static /* Allow the variable to be controlled manually for testing */
78 #endif
80 
82 #if !defined(BLAKE3_TESTING)
83 static
84 #endif
85  enum cpu_feature
87 
88  if (g_cpu_features != UNDEFINED) {
89  return g_cpu_features;
90  } else {
91 #if defined(IS_X86)
92  uint32_t regs[4] = {0};
93  uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
94  (void)edx;
95  enum cpu_feature features = 0;
96  cpuid(regs, 0);
97  const int max_id = *eax;
98  cpuid(regs, 1);
99 #if defined(__amd64__) || defined(_M_X64)
100  features |= SSE2;
101 #else
102  if (*edx & (1UL << 26))
103  features |= SSE2;
104 #endif
105  if (*ecx & (1UL << 0))
106  features |= SSSE3;
107  if (*ecx & (1UL << 19))
108  features |= SSE41;
109 
110  if (*ecx & (1UL << 27)) { // OSXSAVE
111  const uint64_t mask = xgetbv();
112  if ((mask & 6) == 6) { // SSE and AVX states
113  if (*ecx & (1UL << 28))
114  features |= AVX;
115  if (max_id >= 7) {
116  cpuidex(regs, 7, 0);
117  if (*ebx & (1UL << 5))
118  features |= AVX2;
119  if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
120  if (*ebx & (1UL << 31))
121  features |= AVX512VL;
122  if (*ebx & (1UL << 16))
123  features |= AVX512F;
124  }
125  }
126  }
127  }
128  g_cpu_features = features;
129  return features;
130 #else
131  /* How to detect NEON? */
132  return 0;
133 #endif
134  }
135 }
136 
138  const uint8_t block[BLAKE3_BLOCK_LEN],
139  uint8_t block_len, uint64_t counter,
140  uint8_t flags) {
141 #if defined(IS_X86)
142  const enum cpu_feature features = get_cpu_features();
143  MAYBE_UNUSED(features);
144 #if !defined(BLAKE3_NO_AVX512)
145  if (features & AVX512VL) {
146  blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
147  return;
148  }
149 #endif
150 #if !defined(BLAKE3_NO_SSE41)
151  if (features & SSE41) {
152  blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
153  return;
154  }
155 #endif
156 #if !defined(BLAKE3_NO_SSE2)
157  if (features & SSE2) {
158  blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
159  return;
160  }
161 #endif
162 #endif
163  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
164 }
165 
166 void blake3_compress_xof(const uint32_t cv[8],
167  const uint8_t block[BLAKE3_BLOCK_LEN],
168  uint8_t block_len, uint64_t counter, uint8_t flags,
169  uint8_t out[64]) {
170 #if defined(IS_X86)
171  const enum cpu_feature features = get_cpu_features();
172  MAYBE_UNUSED(features);
173 #if !defined(BLAKE3_NO_AVX512)
174  if (features & AVX512VL) {
175  blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
176  return;
177  }
178 #endif
179 #if !defined(BLAKE3_NO_SSE41)
180  if (features & SSE41) {
181  blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
182  return;
183  }
184 #endif
185 #if !defined(BLAKE3_NO_SSE2)
186  if (features & SSE2) {
187  blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
188  return;
189  }
190 #endif
191 #endif
192  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
193 }
194 
/*
 * Hash several inputs, forwarding every argument unchanged to the first
 * available implementation. On x86 builds the order is AVX-512 (requires
 * both AVX512F and AVX512VL), AVX2, SSE4.1, SSE2; each can be compiled out
 * with its BLAKE3_NO_* macro. If none is taken, the NEON implementation is
 * used when BLAKE3_USE_NEON == 1, otherwise the portable one.
 */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  /* Both AVX-512 flags must be present for this path. */
  if ((cpu & AVX512F) != 0 && (cpu & AVX512VL) != 0) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif

#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
246 
// The dynamically detected SIMD degree of the current platform.
//
// On x86 builds this is 16 when both AVX512F and AVX512VL are available,
// 8 for AVX2, and 4 for SSE4.1 or SSE2 (each check can be compiled out with
// its BLAKE3_NO_* macro). If no x86 check matches, the result is 4 when
// BLAKE3_USE_NEON == 1 and 1 otherwise.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & AVX512F) != 0 && (cpu & AVX512VL) != 0) {
    return 16;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    return 8;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    return 4;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    return 4;
  }
#endif
#endif
#if BLAKE3_USE_NEON == 1
  return 4;
#else
  return 1;
#endif
}
block
we get the following basic block
Definition: README_ALTIVEC.txt:95
LLVM_ATTRIBUTE_USED
#define LLVM_ATTRIBUTE_USED
Definition: Compiler.h:139
ecx
Instead of the following for memset char edx edx edx It might be better to generate eax movl edx movl edx movw edx when we can spare a register It reduces code size Evaluate what the best way to codegen sdiv C is For we currently get ret i32 Y eax movl ecx ecx ecx addl ecx
Definition: README.txt:147
eax
Add support for conditional and other related patterns Instead eax eax je LBB16_2 eax edi eax movl eax
Definition: README.txt:145
UNDEFINED
@ UNDEFINED
Definition: blake3_dispatch.c:73
SSE41
@ SSE41
Definition: blake3_dispatch.c:67
blake3_compress_xof_sse2
void blake3_compress_xof_sse2(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
Definition: blake3_sse2.c:270
BLAKE3_BLOCK_LEN
#define BLAKE3_BLOCK_LEN
Definition: blake3_impl.h:18
blake3_compress_in_place_portable
LLVM_LIBRARY_VISIBILITY void blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
Definition: blake3_portable.c:84
cpu_feature
cpu_feature
Definition: blake3_dispatch.c:64
blake3_compress_in_place_avx512
void blake3_compress_in_place_avx512(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
Definition: blake3_avx512.c:299
get_cpu_features
static LLVM_ATTRIBUTE_USED enum cpu_feature get_cpu_features(void)
Definition: blake3_dispatch.c:86
blake3_compress_xof_avx512
void blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
Definition: blake3_avx512.c:287
edx
sar eax, 31) more aggressively edx
Definition: README.txt:923
blake3_hash_many_sse2
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
Definition: blake3_sse2.c:541
MAYBE_UNUSED
#define MAYBE_UNUSED(x)
Definition: blake3_dispatch.c:17
AVX
@ AVX
Definition: blake3_dispatch.c:68
SSE2
@ SSE2
Definition: blake3_dispatch.c:65
blake3_hash_many
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
Definition: blake3_dispatch.c:195
blake3_hash_many_avx512
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
Definition: blake3_avx512.c:1162
blake3_compress_in_place_sse2
void blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
Definition: blake3_sse2.c:260
uint64_t
g_cpu_features
static enum cpu_feature g_cpu_features
Definition: blake3_dispatch.c:79
blake3_hash_many_avx2
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
Definition: blake3_avx2.c:303
blake3_compress_xof
void blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
Definition: blake3_dispatch.c:166
AVX512VL
@ AVX512VL
Definition: blake3_dispatch.c:71
blake3_hash_many_portable
LLVM_LIBRARY_VISIBILITY void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
Definition: blake3_portable.c:145
SSSE3
@ SSSE3
Definition: blake3_dispatch.c:66
uint32_t
blake3_compress_xof_sse41
void blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
Definition: blake3_sse41.c:264
AVX512F
@ AVX512F
Definition: blake3_dispatch.c:70
blake3_compress_in_place_sse41
void blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
Definition: blake3_sse41.c:254
blake3_compress_xof_portable
LLVM_LIBRARY_VISIBILITY void blake3_compress_xof_portable(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
Definition: blake3_portable.c:100
AVX2
@ AVX2
Definition: blake3_dispatch.c:69
blake3_hash_many_sse41
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
Definition: blake3_sse41.c:535
shuffles::mask
auto mask(ShuffFunc S, unsigned Length, OptArgs... args) -> MaskT
Definition: HexagonISelDAGToDAGHVX.cpp:903
blake3_impl.h
blake3_compress_in_place
void blake3_compress_in_place(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
Definition: blake3_dispatch.c:137
ebx
http eax xorl edx cl sete al setne dl sall eax sall edx But that requires good bit subreg support this might be better It s an extra but it s one instruction and doesn t stress bit subreg eax eax movl edx edx sall eax sall cl edx bit we should expand to a conditional branch like GCC produces Some isel and Sequencing of Instructions Scheduling for reduced register pressure E g Minimum Register Instruction Sequence load p Because the compare isn t it is not matched with the load on both sides The dag combiner should be made smart enough to canonicalize the load into the RHS of a compare when it can invert the result of the compare for free In many LLVM generates code like eax cmpl esp setl al movzbl eax ret on some it is more efficient to do ebx xor eax cmpl ebx
Definition: README.txt:97
blake3_simd_degree
size_t blake3_simd_degree(void)
Definition: blake3_dispatch.c:248