https://simd.godbolt.org/z/qsKWW1heG void dec(int *base, long long offset, int sel) { int *ptr0 = base + offset + 0; int *ptr6 = base + offset + 6; int *ptr = sel ? ptr0 : ptr6; (*ptr)--; } define void @dec(i32* nocapture %0, i64 %1, i32 %2) { %4 = getelementptr inbounds i32, i32* %0, i64 %1 %5 = getelementptr inbounds i32, i32* %4, i64 6 %6 = icmp eq i32 %2, 0 %7 = select i1 %6, i32* %5, i32* %4 %8 = load i32, i32* %7, align 4, !tbaa !3 %9 = add nsw i32 %8, -1 store i32 %9, i32* %7, align 4, !tbaa !3 ret void } dec: leaq (%rdi,%rsi,4), %rax leaq (%rdi,%rsi,4), %rcx addq $24, %rcx testl %edx, %edx cmovneq %rax, %rcx addl $-1, (%rcx) retq We can reduce the number of complex LEA ops by pulling the common "%rsi,4" scaled offset through the cmov and into the addl address: dec: leaq 24(%rdi), %rcx testl %edx, %edx cmovneq %rdi, %rcx addl $-1, (%rcx,%rsi,4) retq This might be generally achievable by canonicalizing the gep-chain, but for now I'm making this an X86 ticket.
So I think we can perform this in InstCombine: define void @dec(i32* nocapture %0, i64 %1, i32 %2) { %cmp = icmp eq i32 %2, 0 %gep0 = getelementptr inbounds i32, i32* %0, i64 %1 %gep1 = getelementptr inbounds i32, i32* %gep0, i64 6 %sel = select i1 %cmp, i32* %gep1, i32* %gep0 %load = load i32, i32* %sel, align 4 %dec = add nsw i32 %load, -1 store i32 %dec, i32* %sel, align 4 ret void } dec: testl %edx, %edx leaq (%rdi,%rsi,4), %rax leaq 24(%rdi,%rsi,4), %rcx cmovneq %rax, %rcx decl (%rcx) --> define void @dec_opt(i32* nocapture %0, i64 %1, i32 %2) { %cmp = icmp eq i32 %2, 0 %add = add i64 %1, 6 %idx = select i1 %cmp, i64 %add, i64 %1 %gep = getelementptr inbounds i32, i32* %0, i64 %idx %load = load i32, i32* %gep, align 4 %dec = add nsw i32 %load, -1 store i32 %dec, i32* %gep, align 4 ret void } dec_opt: leaq 6(%rsi), %rax testl %edx, %edx cmovneq %rsi, %rax decl (%rdi,%rax,4) so I think we're trying to fold: select(cond, gep(gep(ptr, idx0), idx1), gep(ptr, idx0)) -> gep(ptr, select(cond, add(idx0, idx1), idx0))
(In reply to Simon Pilgrim from comment #1) > > select(cond, gep(gep(ptr, idx0), idx1), gep(ptr, idx0)) > -> > gep(ptr, select(cond, add(idx0, idx1), idx0)) This is just a special case of [Bug #50183] / D105901
Candidate Patch: https://reviews.llvm.org/D106352
Alternative Patch: https://reviews.llvm.org/D106450
Current Codegen: define void @dec(i32* nocapture %base, i64 %offset, i32 %sel) { %tobool.not = icmp eq i32 %sel, 0 %cond.idx = select i1 %tobool.not, i64 6, i64 0 %cond.idx9 = add i64 %cond.idx, %offset %cond = getelementptr i32, i32* %base, i64 %cond.idx9 %load = load i32, i32* %cond, align 4 %dec = add nsw i32 %load, -1 store i32 %dec, i32* %cond, align 4 ret void } dec: xorl %eax, %eax testl %edx, %edx movl $6, %ecx cmovneq %rax, %rcx addq %rsi, %rcx decl (%rdi,%rcx,4) retq We can probably improve this further with: %cond.idx = select i1 %tobool.not, i64 6, i64 0 %cond.idx9 = add i64 %cond.idx, %offset --> %cond.idx9 = add i64 %cond.idx, %offset %cond.idx = select i1 %tobool.not, i64 6, i64 0
(In reply to Simon Pilgrim from comment #5) > Current Codegen: > > define void @dec(i32* nocapture %base, i64 %offset, i32 %sel) { > %tobool.not = icmp eq i32 %sel, 0 > %cond.idx = select i1 %tobool.not, i64 6, i64 0 > %cond.idx9 = add i64 %cond.idx, %offset > %cond = getelementptr i32, i32* %base, i64 %cond.idx9 > %load = load i32, i32* %cond, align 4 > %dec = add nsw i32 %load, -1 > store i32 %dec, i32* %cond, align 4 > ret void > } > > dec: > xorl %eax, %eax > testl %edx, %edx > movl $6, %ecx > cmovneq %rax, %rcx > addq %rsi, %rcx > decl (%rdi,%rcx,4) > retq > > We can probably improve this further with: > > %cond.idx = select i1 %tobool.not, i64 6, i64 0 > %cond.idx9 = add i64 %cond.idx, %offset > > --> > > %cond.idx9 = add i64 %cond.idx, %offset > %cond.idx = select i1 %tobool.not, i64 6, i64 0 I'm guessing you meant > %cond.idx = select i1 %tobool.not, i64 6, i64 0 > %cond.idx9 = add i64 %cond.idx, %offset > > --> > > %cond.idx9 = add i64 %offset, 6 > %cond.idx = select i1 %tobool.not, i64 %cond.idx9, i64 %offset ? That increases use count of %offset, which is generally not the preferred direction, even if that doesn't have undef problems.
(In reply to Roman Lebedev from comment #6) > That increases use count of %offset, which is generally not > the preferred direction, even if that doesn't have undef problems. That's correct in general, but for x86 with cmov, there's an opportunity to reduce constant materialization (since cmov needs register operands). See if this looks right: https://reviews.llvm.org/D106607
(In reply to Roman Lebedev from comment #6) > I'm guessing you meant > > %cond.idx = select i1 %tobool.not, i64 6, i64 0 > > %cond.idx9 = add i64 %cond.idx, %offset > > --> > > %cond.idx9 = add i64 %offset, 6 > > %cond.idx = select i1 %tobool.not, i64 %cond.idx9, i64 %offset Yes - sorry, hot weather (for the UK at least..) has melted my brain and destroyed my copy+paste skills :-(
Not sure if this example suggests a different approach, but we added it with https://reviews.llvm.org/D106684 : %class.btAxis = type { %struct.btBroadphaseProxy.base, [3 x i16], [3 x i16], %struct.btBroadphaseProxy* } %struct.btBroadphaseProxy.base = type <{ i8*, i16, i16, [4 x i8], i8*, i32, [4 x float], [4 x float] }> %struct.btBroadphaseProxy = type <{ i8*, i16, i16, [4 x i8], i8*, i32, [4 x float], [4 x float], [4 x i8] }> define i16* @bullet(i1 %b, %class.btAxis* readnone %ptr, i64 %idx) { %gep2 = getelementptr inbounds %class.btAxis, %class.btAxis* %ptr, i64 %idx, i32 2, i64 0 %gep1 = getelementptr inbounds %class.btAxis, %class.btAxis* %ptr, i64 %idx, i32 1, i64 0 %sel = select i1 %b, i16* %gep1, i16* %gep2 ret i16* %sel } And the suggested improvement to decompose the complex LEA: https://llvm.godbolt.org/z/WMaPvfKKh
We have three x86 codegen folds so far based on this: https://reviews.llvm.org/rGf060aa1cf3f4 https://reviews.llvm.org/rG1ce05ad619a5 https://reviews.llvm.org/rG4c41caa28710 And the example here is based on a sequence in a Bullet benchmark where GCC was doing better: https://www.phoronix.com/scan.php?page=article&item=clang12-gcc11-icelake&num=6 Do we know if we made a dent in that deficit? I'm not sure how to get that benchmark built and running locally.
There are some instructions on how to build it here: https://openbenchmarking.org/innhold/99d3a8c1ea3ea71e1edf4aea6bf9af30100f07d5