Consider this C++ program: ########################################## struct Foo { void* tagged; Foo* follow(void); Foo* collect(unsigned = 1); }; Foo* Foo::follow(void) { int t = (unsigned long)tagged & 0x3; switch (t) { case 0: case 1: return (this + 1)->follow(); case 3: return this + 1; case 2: return (this + 1)->collect(); } } Foo* Foo::collect(unsigned acc) { int t = (unsigned long)tagged & 0x3; switch (t) { case 0: case 1: return (this + 1)->collect((acc << 1) | t); case 3: return this + 1; case 2: return this + 1 + acc; } } ########################################## Clang compiles the second function to: ########################################## define %struct.Foo* @_ZN3Foo7collectEj(%struct.Foo* %this, i32 %acc) nounwind readonly align 2 { entry: br label %tailrecurse tailrecurse: ; preds = %sw.bb, %entry %indvar = phi i64 [ %indvar.next, %sw.bb ], [ 0, %entry ] %acc.tr = phi i32 [ %or, %sw.bb ], [ %acc, %entry ] %tmp = getelementptr inbounds %struct.Foo* %this, i64 %indvar, i32 0 %tmp2 = load i8** %tmp, align 8 %0 = ptrtoint i8* %tmp2 to i64 %and = and i64 %0, 3 %conv = trunc i64 %and to i32 switch i32 %conv, label %sw.epilog [ i32 0, label %sw.bb i32 1, label %sw.bb i32 3, label %sw.bb6 i32 2, label %sw.bb8 ] sw.bb: ; preds = %tailrecurse, %tailrecurse %shl = shl i32 %acc.tr, 1 %or = or i32 %conv, %shl %indvar.next = add i64 %indvar, 1 br label %tailrecurse sw.bb6: ; preds = %tailrecurse %this.tr.sum21 = add i64 %indvar, 1 %add.ptr7 = getelementptr inbounds %struct.Foo* %this, i64 %this.tr.sum21 ret %struct.Foo* %add.ptr7 sw.bb8: ; preds = %tailrecurse %idx.ext = zext i32 %acc.tr to i64 %add.ptr9.sum = add i64 %idx.ext, 1 %this.tr.sum = add i64 %indvar, %add.ptr9.sum %add.ptr11 = getelementptr inbounds %struct.Foo* %this, i64 %this.tr.sum ret %struct.Foo* %add.ptr11 sw.epilog: ; preds = %tailrecurse ret %struct.Foo* undef } ########################################## With a pretty switch statement inside. 
As an aside, I get clang warnings: Release+Asserts/bin/clang++ -O2 switch.cpp -o switch.O2.ll -emit-llvm -S -fno-exceptions switch.cpp:21:1: warning: control may reach end of non-void function [-Wreturn-type] } ^ switch.cpp:35:1: warning: control may reach end of non-void function [-Wreturn-type] } ^ 2 warnings generated. These are an inability in clang to see that the AND mask has 2 bits, which admits 4 combinations and that all are covered in the switch instruction. Also the default switch arm (returning undef) arises from this. Meta-question: can the switch be written as: "switch i32 %conv, undefined [...]"? Okay, now let's look at the generated assembly (x86-64) of the first function: ########################################## _ZN3Foo6followEv: .Leh_func_begin0: pushq %rbp .Ltmp0: movq %rsp, %rbp .Ltmp1: movl $2, %ecx movq %rdi, %rdx leaq 8(%rdi), %rax leaq 16(%rdi), %rsi jmp .LBB0_1 .align 16, 0x90 .LBB0_9: addq $8, %rax incq %rcx addq $8, %rsi .LBB0_1: movl -16(%rsi), %edi andl $3, %edi ;; we should jz here cmpl $2, %edi jb .LBB0_9 cmpl $3, %edi ;; no need for this compare je .LBB0_10 cmpl $2, %edi ;; no need for this compare jne .LBB0_13 movl $1, %eax .align 16, 0x90 .LBB0_5: movl -8(%rsi), %edi andl $3, %edi ;; we should jz here cmpl $3, %edi ;; comparing with 2 would allow "trisection" je .LBB0_11 cmpl $2, %edi ;; no need for this compare je .LBB0_12 cmpl $1, %edi ;; no need for this compare ja .LBB0_13 addl %eax, %eax orl %edi, %eax incq %rcx addq $8, %rsi jmp .LBB0_5 .LBB0_10: popq %rbp ret .LBB0_11: movq %rsi, %rax popq %rbp ret .LBB0_12: movl %eax, %eax addq %rcx, %rax leaq (%rdx,%rax,8), %rax .LBB0_13: popq %rbp ret .Ltmp2: .size _ZN3Foo6followEv, .Ltmp2-_ZN3Foo6followEv .Leh_func_end0: ########################################## I have commented stuff that looks very inefficient.
I can imagine a target-independent lowering of a 2-bit value domain (the AND mask having popcnt=2, with the zero value peeled off immediately after the AND) like this: Enumerate the 3 possible values in order: x < y < z. Compare against y: if equal ---> leg for y; if smaller ---> leg for x; if bigger ---> leg for z. Look ma, only one compare! When the 2 bits of the mask are adjacent, then on targets which have an rcr (rotate with carry) instruction one could rotate the upper bit into the carry flag, and the next bit would end up in the sign (minus) flag. This would allow a 4-way branch. On other targets (PPC) we could move to the condition code register and have a multi-way branch too. Links: http://docs.sun.com/app/docs/doc/802-1948/6i5uqa9p5?l=en&a=view
(In reply to comment #0) > ########################################## > > Clang compiles the second function to: > > ########################################## Should add the clang invocation: Release+Asserts/bin/clang++ -O2 switch.cpp -o switch.O2.ll [-emit-llvm] -S -fno-exceptions
first subproblem solved: http://llvm.org/viewvc/llvm-project?view=rev&revision=113668 (One test still fails, new example test to be filecheckized)
(In reply to comment #2) > first subproblem solved: > > http://llvm.org/viewvc/llvm-project?view=rev&revision=113668 > > (One test still fails, new example test to be filecheckized) These commits of Bill's, http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20100906/107781.html and the surrounding ones, may be interesting for switching on predication on PPC. It would be nice to see the effect of my commit on ARM with those peephole changes.
My branch https://llvm.org/svn/llvm-project/llvm/branches/ggreif/switch-opts already shows this improvement: .file "/home/gabor/llvm/test/CodeGen/X86/switch-and.ll" .text .globl _ZN3Foo7collectEj .align 16, 0x90 .type _ZN3Foo7collectEj,@function _ZN3Foo7collectEj: # @_ZN3Foo7collectEj # BB#0: # %entry movl $1, %ecx movq %rdi, %rdx leaq 8(%rdi), %rax jmp .LBB0_1 .align 16, 0x90 .LBB0_4: # %sw.bb # in Loop: Header=BB0_1 Depth=1 addl %esi, %esi orl %edi, %esi addq $8, %rax incq %rcx .LBB0_1: # %tailrecurse # =>This Inner Loop Header: Depth=1 movl -8(%rax), %edi andl $3, %edi je .LBB0_4 # BB#2: # %nz # in Loop: Header=BB0_1 Depth=1 cmpl $2, %edi je .LBB0_6 # BB#3: # %nz.non-middle # in Loop: Header=BB0_1 Depth=1 cmpl $2, %edi jbe .LBB0_4 # BB#5: # %sw.bb6 ret .LBB0_6: # %sw.bb8 movl %esi, %eax addq %rcx, %rax leaq (%rdx,%rax,8), %rax ret .Ltmp0: .size _ZN3Foo7collectEj, .Ltmp0-_ZN3Foo7collectEj .section .note.GNU-stack,"",@progbits
*** This bug has been marked as a duplicate of bug 774 ***