Created attachment 19032 [details] Test file LLVM is producing unusual code for ADCX and ADOX. The source file below (zboson.cxx) is modified from http://stackoverflow.com/q/33690791/608639. All the tests were run on the same machine. The machine is a 6th gen Core i5 (Skylake). It includes ADX cpu features. ******************** $ cat zboson.cxx #include <x86intrin.h> #include <stdint.h> #define LEN 4 // N = N*64-bit add e.g. 4=256-bit add, 3=192-bit add, ... static unsigned char c = 0; template<int START, int N> struct Repeat { static void add (uint64_t *x, uint64_t *y) { #if defined(__INTEL_COMPILER) const uint64_t* a = x; uint64_t* b = y; #else const long long unsigned int* a = reinterpret_cast<const long long unsigned int*>(x); long long unsigned int* b = reinterpret_cast<long long unsigned int*>(y); #endif c = _addcarryx_u64(c, a[START], b[START], &b[START]); Repeat<START+1, N>::add(x,y); } }; template<int N> struct Repeat<LEN, N> { static void add (uint64_t *x, uint64_t *y) {} }; void sum_unroll(uint64_t *x, uint64_t *y) { Repeat<0,LEN>::add(x,y); } ******************** $ clang++ -g2 -O3 -march=native zboson.cxx -c $ objdump --disassemble zboson.o zboson.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 <_Z10sum_unrollPmS_>: 0: 55 push %rbp 1: 48 89 e5 mov %rsp,%rbp 4: 8a 05 00 00 00 00 mov 0x0(%rip),%al # a <_Z10sum_unrollPmS_+0xa> a: 48 8b 0f mov (%rdi),%rcx d: 04 ff add $0xff,%al f: 66 48 0f 38 f6 0e adcx (%rsi),%rcx 15: 48 89 0e mov %rcx,(%rsi) 18: 50 push %rax 19: 48 89 f8 mov %rdi,%rax 1c: 04 7f add $0x7f,%al 1e: 9e sahf 1f: 58 pop %rax 20: 0f 92 c0 setb %al 23: 48 8b 4f 08 mov 0x8(%rdi),%rcx 27: 04 ff add $0xff,%al 29: 66 48 0f 38 f6 4e 08 adcx 0x8(%rsi),%rcx 30: 48 89 4e 08 mov %rcx,0x8(%rsi) 34: 50 push %rax 35: 48 89 f8 mov %rdi,%rax 38: 04 7f add $0x7f,%al 3a: 9e sahf 3b: 58 pop %rax 3c: 0f 92 c0 setb %al 3f: 48 8b 4f 10 mov 0x10(%rdi),%rcx 43: 04 ff add $0xff,%al 45: 66 48 0f 38 f6 4e 10 adcx 0x10(%rsi),%rcx 4c: 48 89 4e 10 
mov %rcx,0x10(%rsi) 50: 50 push %rax 51: 48 89 f8 mov %rdi,%rax 54: 04 7f add $0x7f,%al 56: 9e sahf 57: 58 pop %rax 58: 0f 92 c0 setb %al 5b: 48 8b 4f 18 mov 0x18(%rdi),%rcx 5f: 04 ff add $0xff,%al 61: 66 48 0f 38 f6 4e 18 adcx 0x18(%rsi),%rcx 68: 48 89 4e 18 mov %rcx,0x18(%rsi) 6c: 50 push %rax 6d: 48 89 f8 mov %rdi,%rax 70: 04 7f add $0x7f,%al 72: 9e sahf 73: 58 pop %rax 74: 0f 92 c0 setb %al 77: 88 05 00 00 00 00 mov %al,0x0(%rip) # 7d <_Z10sum_unrollPmS_+0x7d> 7d: 5d pop %rbp 7e: c3 retq ******************** $ icc -g2 -O3 -march=native zboson.cxx -c $ objdump --disassemble zboson.o zboson.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 <_Z10sum_unrollPmS_>: 0: 45 33 c9 xor %r9d,%r9d 3: 0f b6 05 00 00 00 00 movzbl 0x0(%rip),%eax # a <_Z10sum_unrollPmS_+0xa> a: 44 3b c8 cmp %eax,%r9d d: 48 8b 17 mov (%rdi),%rdx 10: 66 48 0f 38 f6 16 adcx (%rsi),%rdx 16: 48 89 16 mov %rdx,(%rsi) 19: 48 8b 4f 08 mov 0x8(%rdi),%rcx 1d: 66 48 0f 38 f6 4e 08 adcx 0x8(%rsi),%rcx 24: 48 89 4e 08 mov %rcx,0x8(%rsi) 28: 4c 8b 47 10 mov 0x10(%rdi),%r8 2c: 66 4c 0f 38 f6 46 10 adcx 0x10(%rsi),%r8 33: 4c 89 46 10 mov %r8,0x10(%rsi) 37: 4c 8b 57 18 mov 0x18(%rdi),%r10 3b: 66 4c 0f 38 f6 56 18 adcx 0x18(%rsi),%r10 42: 4c 89 56 18 mov %r10,0x18(%rsi) 46: 41 0f 92 c1 setb %r9b 4a: 44 88 0d 00 00 00 00 mov %r9b,0x0(%rip) # 51 <_Z10sum_unrollPmS_+0x51> 51: c3 retq 52: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 59: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) ******************** $ g++ -g2 -O3 -march=native zboson.cxx -c $ objdump --disassemble zboson.o zboson.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 <_Z10sum_unrollPmS_>: 0: 0f b6 05 00 00 00 00 movzbl 0x0(%rip),%eax # 7 <_Z10sum_unrollPmS_+0x7> 7: 04 ff add $0xff,%al 9: 48 8b 07 mov (%rdi),%rax c: 48 13 06 adc (%rsi),%rax f: 0f 92 c2 setb %dl 12: 48 89 06 mov %rax,(%rsi) 15: 48 8b 47 08 mov 0x8(%rdi),%rax 19: 80 c2 ff add $0xff,%dl 1c: 48 13 46 08 adc 0x8(%rsi),%rax 20: 0f 92 c2 setb %dl 23: 48 89 46 08 
mov %rax,0x8(%rsi) 27: 48 8b 47 10 mov 0x10(%rdi),%rax 2b: 80 c2 ff add $0xff,%dl 2e: 48 13 46 10 adc 0x10(%rsi),%rax 32: 0f 92 c2 setb %dl 35: 48 89 46 10 mov %rax,0x10(%rsi) 39: 48 8b 47 18 mov 0x18(%rdi),%rax 3d: 80 c2 ff add $0xff,%dl 40: 48 13 46 18 adc 0x18(%rsi),%rax 44: 48 89 46 18 mov %rax,0x18(%rsi) 48: 0f 92 05 00 00 00 00 setb 0x0(%rip) # 4f <_Z10sum_unrollPmS_+0x4f> 4f: c3 retq ******************** Switching from addcaryx_u64 to addcarry_u64 produces the same unusual code: c = _addcarry_u64(c, a[START], b[START], &b[START]); Repeat<START+1, N>::add(x,y); $ clang++ -g2 -O3 -march=native zboson.cxx -c $ objdump --disassemble zboson.o zboson.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 <_Z10sum_unrollPmS_>: 0: 55 push %rbp 1: 48 89 e5 mov %rsp,%rbp 4: 8a 05 00 00 00 00 mov 0x0(%rip),%al # a <_Z10sum_unrollPmS_+0xa> a: 48 8b 0f mov (%rdi),%rcx d: 04 ff add $0xff,%al f: 66 48 0f 38 f6 0e adcx (%rsi),%rcx 15: 48 89 0e mov %rcx,(%rsi) 18: 50 push %rax 19: 48 89 f8 mov %rdi,%rax 1c: 04 7f add $0x7f,%al 1e: 9e sahf 1f: 58 pop %rax 20: 0f 92 c0 setb %al 23: 48 8b 4f 08 mov 0x8(%rdi),%rcx 27: 04 ff add $0xff,%al 29: 66 48 0f 38 f6 4e 08 adcx 0x8(%rsi),%rcx 30: 48 89 4e 08 mov %rcx,0x8(%rsi) 34: 50 push %rax 35: 48 89 f8 mov %rdi,%rax 38: 04 7f add $0x7f,%al 3a: 9e sahf 3b: 58 pop %rax 3c: 0f 92 c0 setb %al 3f: 48 8b 4f 10 mov 0x10(%rdi),%rcx 43: 04 ff add $0xff,%al 45: 66 48 0f 38 f6 4e 10 adcx 0x10(%rsi),%rcx 4c: 48 89 4e 10 mov %rcx,0x10(%rsi) 50: 50 push %rax 51: 48 89 f8 mov %rdi,%rax 54: 04 7f add $0x7f,%al 56: 9e sahf 57: 58 pop %rax 58: 0f 92 c0 setb %al 5b: 48 8b 4f 18 mov 0x18(%rdi),%rcx 5f: 04 ff add $0xff,%al 61: 66 48 0f 38 f6 4e 18 adcx 0x18(%rsi),%rcx 68: 48 89 4e 18 mov %rcx,0x18(%rsi) 6c: 50 push %rax 6d: 48 89 f8 mov %rdi,%rax 70: 04 7f add $0x7f,%al 72: 9e sahf 73: 58 pop %rax 74: 0f 92 c0 setb %al 77: 88 05 00 00 00 00 mov %al,0x0(%rip) # 7d <_Z10sum_unrollPmS_+0x7d> 7d: 5d pop %rbp 7e: c3 retq ******************** 
$ clang++ --version clang version 4.0.0 (tags/RELEASE_400/final) Target: x86_64-unknown-linux-gnu Thread model: posix InstalledDir: /bin $ g++ --version g++ (GCC) 7.1.1 20170622 (Red Hat 7.1.1-3) Copyright (C) 2017 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. $ icc --version icc (ICC) 17.0.4 20170411 Copyright (C) 1985-2017 Intel Corporation. All rights reserved.
Related, GCC does not support ADOX and ADCX at the moment. "At the moment" includes GCC 6.4 (Fedora 25) and GCC 7.1 (Fedora 26). GCC effectively disabled the intrinsics, but it still advertises support by defining `__ADX__` in the preprocessor. Also see Issue 67317, "Silly code generation for _addcarry_u32/_addcarry_u64" (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67317). Many thanks to Xi Ruoyao for finding the issue. According to Uros Bizjak on the GCC Help mailing list, GCC may never support the intrinsics (https://gcc.gnu.org/ml/gcc-help/2017-08/msg00100.html). Also see "GCC does not generate ADCX or ADOX for _addcarryx_u64" (https://gcc.gnu.org/ml/gcc-help/2017-08/msg00085.html) on the GCC Help mailing list.
Top of the trunk is showing following ASM sequence 0000000000000000 <_Z10sum_unrollPmS_>: 0: 8a 05 00 00 00 00 mov 0x0(%rip),%al # 6 <_Z10sum_unrollPmS_+0x6> 6: 48 8b 0f mov (%rdi),%rcx 9: 04 ff add $0xff,%al b: 48 13 0e adc (%rsi),%rcx e: 48 89 0e mov %rcx,(%rsi) 11: 48 8b 47 08 mov 0x8(%rdi),%rax 15: 48 13 46 08 adc 0x8(%rsi),%rax 19: 48 89 46 08 mov %rax,0x8(%rsi) 1d: 48 8b 47 10 mov 0x10(%rdi),%rax 21: 48 13 46 10 adc 0x10(%rsi),%rax 25: 48 89 46 10 mov %rax,0x10(%rsi) 29: 48 8b 47 18 mov 0x18(%rdi),%rax 2d: 48 13 46 18 adc 0x18(%rsi),%rax 31: 48 89 46 18 mov %rax,0x18(%rsi) 35: 0f 92 05 00 00 00 00 setb 0x0(%rip) # 3c <_Z10sum_unrollPmS_+0x3c> 3c: c3 retq
Current codegen: https://godbolt.org/z/qtz3Xh We stopped lowering to ADOX/ADCX at rL342059 (llvm 8.0), and like gcc, we're unlikely to go back again unless someone can demonstrate a strong need for FLAG-less codegen (even though its encoding is longer and can't fold an immediate). _Z10sum_unrollPmS_: # @_Z10sum_unrollPmS_ movb _ZL1c(%rip), %al movq (%rdi), %rcx addb $-1, %al adcq %rcx, (%rsi) movq 8(%rdi), %rax adcq %rax, 8(%rsi) movq 16(%rdi), %rax adcq %rax, 16(%rsi) movq 24(%rdi), %rax adcq %rax, 24(%rsi) setb _ZL1c(%rip) retq What I'm not sure about is using the RMW versions for ADC, which tends to generate a lot of uops/stalls. Especially given that here we could use the load fold versions followed by a store which would (IMO) likely be a lot more performant. Craig? Andrea? Peter?
Agreed RMW ADC/SBB is bad on P6 derived CPUs and the current core CPUs. Less sure about Silvermont/Goldmont and Pentium 4. How is it on AMD?
(In reply to Craig Topper from comment #4) > Agreed RMW ADC/SBB is bad on P6 derived CPUs and the current core CPUs. Less > sure about Silvermont/Goldmont and Pentium 4. How is it on AMD? llvm-mca comparison for skylake: https://godbolt.org/z/YHDQPO which sees the throughput drop from 6cy to 5cy/loop, at the expense of a higher instruction count. You can change it to compare other cpus - btver2 suggests that RMW is faster, but it'll have to wait until next week for us to confirm.
(In reply to Simon Pilgrim from comment #3) > Current codegen: https://godbolt.org/z/qtz3Xh > > We stopped lowering to ADOX/ADCX at rL342059 (llvm 8.0), and like gcc, we're > unlikely to go back again unless someone can demonstrate a strong need for > FLAG-less codegen (even though it's encoding is longer and can't fold an > immediate). Just an FYI... I tried to test a cut-in of ADCX and ADOX with GCC inline ASM using Crypto++ in its Integer class (https://github.com/weidai11/cryptopp/blob/master/integer.cpp). At the same time I also took MULX for a test drive using inline ASM. I was not able to obtain [expected?] performance improvements. In fact the big integer operations slowed down considerably. The performance loss may have been my fault and my integration. Barring dumb mistakes on my part, I did not feel the instructions performed the way they should. We probably won't use ADCX and ADOX unless Intel speeds up the instructions and makes it profitable to use them. Until our benchmarks show it is profitable, it is dead on arrival. We don't trade fast code for slow code :) (I also observed whitepapers like https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/large-integer-squaring-ia-paper.pdf don't provide benchmarks. In hindsight that should have been a red flag).
Thanks Jeffrey, in which case I'm going to suggest we resolve this and we raise a new bug about how/where we want to generally use RMW instructions. I've added the test case at rL352274.
(In reply to Simon Pilgrim from comment #7) > Thanks Jeffrey, in which case I'm going to suggest we resolve this and we > raise a new bug about how/where we want to generally use RMW instructions. > > I've added the test case at rL352274. Resolving this, I've raised the RMW issue on [Bug #40830].