8125 – switches over a 2-bit domain lowered inefficiently

LLVM Bugzilla is read-only and represents the historical archive of all LLVM issues filled before November 26, 2021. Use github to submit LLVM bugs

Bug 8125 - switches over a 2-bit domain lowered inefficiently

Summary: switches over a 2-bit domain lowered inefficiently

Status:	RESOLVED DUPLICATE of bug 774

Alias:	None

Product:	libraries
Classification:	Unclassified
Component:	Backend: X86 (show other bugs)
Version:	trunk
Hardware:	All All

Importance:	P normal
Assignee:	Unassigned LLVM Bugs

URL:
Keywords:

Depends on:	8361
Blocks:	6809
	Show dependency tree

Reported:	2010-09-10 08:37 PDT by Gabor Greif
Modified:	2010-12-02 01:13 PST (History)
CC List:	4 users (show)

See Also:
Fixed By Commit(s):

Attachments
Add an attachment (proposed patch, testcase, etc.)

Note You need to log in before you can comment on or make changes to this bug.

Description Gabor Greif 2010-09-10 08:37:16 PDT

Consider this C++ program:

##########################################
struct Foo {
    void* tagged;
    
    Foo* follow(void);
    Foo* collect(unsigned = 1);
};


Foo* Foo::follow(void) {
    int t = (unsigned long)tagged & 0x3;
    switch (t)
    {
        case 0:
        case 1:
            return (this + 1)->follow();
        case 3:
            return this + 1;
        case 2:
            return (this + 1)->collect();
    }
}

Foo* Foo::collect(unsigned acc) {
    int t = (unsigned long)tagged & 0x3;
    switch (t)
    {
        case 0:
        case 1:
            return (this + 1)->collect((acc << 1) | t);
        case 3:
            return this + 1;
        case 2:
            return this + 1 + acc;
    }
}
##########################################

Clang compiles the second function to:

##########################################
define %struct.Foo* @_ZN3Foo7collectEj(%struct.Foo* %this, i32 %acc) nounwind readonly align 2 {
entry:
  br label %tailrecurse

tailrecurse:                                      ; preds = %sw.bb, %entry
  %indvar = phi i64 [ %indvar.next, %sw.bb ], [ 0, %entry ]
  %acc.tr = phi i32 [ %or, %sw.bb ], [ %acc, %entry ]
  %tmp = getelementptr inbounds %struct.Foo* %this, i64 %indvar, i32 0
  %tmp2 = load i8** %tmp, align 8
  %0 = ptrtoint i8* %tmp2 to i64
  %and = and i64 %0, 3
  %conv = trunc i64 %and to i32
  switch i32 %conv, label %sw.epilog [
    i32 0, label %sw.bb
    i32 1, label %sw.bb
    i32 3, label %sw.bb6
    i32 2, label %sw.bb8
  ]

sw.bb:                                            ; preds = %tailrecurse, %tailrecurse
  %shl = shl i32 %acc.tr, 1
  %or = or i32 %conv, %shl
  %indvar.next = add i64 %indvar, 1
  br label %tailrecurse

sw.bb6:                                           ; preds = %tailrecurse
  %this.tr.sum21 = add i64 %indvar, 1
  %add.ptr7 = getelementptr inbounds %struct.Foo* %this, i64 %this.tr.sum21
  ret %struct.Foo* %add.ptr7

sw.bb8:                                           ; preds = %tailrecurse
  %idx.ext = zext i32 %acc.tr to i64
  %add.ptr9.sum = add i64 %idx.ext, 1
  %this.tr.sum = add i64 %indvar, %add.ptr9.sum
  %add.ptr11 = getelementptr inbounds %struct.Foo* %this, i64 %this.tr.sum
  ret %struct.Foo* %add.ptr11

sw.epilog:                                        ; preds = %tailrecurse
  ret %struct.Foo* undef
}
##########################################

With a pretty switch statement inside.

As an aside, I get clang warnings:
Release+Asserts/bin/clang++ -O2 switch.cpp -o switch.O2.ll -emit-llvm -S -fno-exceptions
switch.cpp:21:1: warning: control may reach end of non-void function [-Wreturn-type]
}
^
switch.cpp:35:1: warning: control may reach end of non-void function [-Wreturn-type]
}
^
2 warnings generated.

These are an inability in clang to see that the AND mask has 2 bits, which admits 4 combinations and that all are covered in the switch instruction.
Also the default switch arm (returning undefined arises from this). Meta-question can the switch be written as: "switch i32 %conv, undefined [...]"?

Okay, now let's look at the generated assembly (x86-64) of the first function:

##########################################
_ZN3Foo6followEv:
.Leh_func_begin0:
	pushq	%rbp
.Ltmp0:
	movq	%rsp, %rbp
.Ltmp1:
	movl	$2, %ecx
	movq	%rdi, %rdx
	leaq	8(%rdi), %rax
	leaq	16(%rdi), %rsi
	jmp	.LBB0_1
	.align	16, 0x90
.LBB0_9:
	addq	$8, %rax
	incq	%rcx
	addq	$8, %rsi
.LBB0_1:
	movl	-16(%rsi), %edi
	andl	$3, %edi
				;;  we should jz here
	cmpl	$2, %edi
	jb	.LBB0_9
	cmpl	$3, %edi
				;; no need for this compare
	je	.LBB0_10
	cmpl	$2, %edi
				;; no need for this compare
	jne	.LBB0_13
	movl	$1, %eax
	.align	16, 0x90
.LBB0_5:
	movl	-8(%rsi), %edi
	andl	$3, %edi
				;;  we should jz here
	cmpl	$3, %edi
				;; comparing with 2 would allow "trisection"
	je	.LBB0_11
	cmpl	$2, %edi
				;; no need for this compare
	je	.LBB0_12
	cmpl	$1, %edi
				;; no need for this compare
	ja	.LBB0_13
	addl	%eax, %eax
	orl	%edi, %eax
	incq	%rcx
	addq	$8, %rsi
	jmp	.LBB0_5
.LBB0_10:
	popq	%rbp
	ret
.LBB0_11:
	movq	%rsi, %rax
	popq	%rbp
	ret
.LBB0_12:
	movl	%eax, %eax
	addq	%rcx, %rax
	leaq	(%rdx,%rax,8), %rax
.LBB0_13:
	popq	%rbp
	ret
.Ltmp2:
	.size	_ZN3Foo6followEv, .Ltmp2-_ZN3Foo6followEv
.Leh_func_end0:
##########################################


I have commented stuff that looks very inefficient. I can imagine a target independent lowering of a 2-bit value domain (the AND-mask having popcnt=2 with the zero-value peeled off immediately after the AND) like this:

Enumerate the 3 possible values in order:

x < y < z

compare against y, if eq ---> leg for y
                   if smaller  ---> leg for x
                   if bigger  ---> leg for z

Look 'ma, only one compare!

When the 2 bits of the mask are adjacent, then on targets which have a rcr (rotate with carry) one could rotate the upper bit into the carry flag and the next bit would end up in the minus flag. This would allow a 4-way branch.
On other targets (PPC) we could move to condition code register and have a multi-way branch too.

Links: http://docs.sun.com/app/docs/doc/802-1948/6i5uqa9p5?l=en&a=view

Comment 1 Gabor Greif 2010-09-10 08:43:50 PDT

(In reply to comment #0)
> ##########################################
> 
> Clang compiles the second function to:
> 
> ##########################################

Should add the clang invocation:

Release+Asserts/bin/clang++ -O2 switch.cpp -o switch.O2.ll [-emit-llvm] -S -fno-exceptions

Comment 2 Gabor Greif 2010-09-10 19:08:19 PDT

first subproblem solved:

http://llvm.org/viewvc/llvm-project?view=rev&revision=113668

(One test still fails, new example test to be filecheckized)

Comment 3 Gabor Greif 2010-09-11 02:03:03 PDT

(In reply to comment #2)
> first subproblem solved:
> 
> http://llvm.org/viewvc/llvm-project?view=rev&revision=113668
> 
> (One test still fails, new example test to be filecheckized)

These commits of Bill
http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20100906/107781.html
and around may be interesting for switching on predication on PPC.

Would be nice to see the effect of my commit on ARM with those peephole changes.

Comment 4 Gabor Greif 2010-10-06 20:18:46 PDT

My branch
https://llvm.org/svn/llvm-project/llvm/branches/ggreif/switch-opts
already shows this improvement:


        .file   "/home/gabor/llvm/test/CodeGen/X86/switch-and.ll"
        .text
        .globl  _ZN3Foo7collectEj
        .align  16, 0x90
        .type   _ZN3Foo7collectEj,@function
_ZN3Foo7collectEj:                      # @_ZN3Foo7collectEj
# BB#0:                                 # %entry
        movl    $1, %ecx
        movq    %rdi, %rdx
        leaq    8(%rdi), %rax
        jmp     .LBB0_1
        .align  16, 0x90
.LBB0_4:                                # %sw.bb
                                        #   in Loop: Header=BB0_1 Depth=1
        addl    %esi, %esi
        orl     %edi, %esi
        addq    $8, %rax
        incq    %rcx
.LBB0_1:                                # %tailrecurse
                                        # =>This Inner Loop Header: Depth=1
        movl    -8(%rax), %edi
        andl    $3, %edi
        je      .LBB0_4
# BB#2:                                 # %nz
                                        #   in Loop: Header=BB0_1 Depth=1
        cmpl    $2, %edi
        je      .LBB0_6
# BB#3:                                 # %nz.non-middle
                                        #   in Loop: Header=BB0_1 Depth=1
        cmpl    $2, %edi
        jbe     .LBB0_4
# BB#5:                                 # %sw.bb6
        ret
.LBB0_6:                                # %sw.bb8
        movl    %esi, %eax
        addq    %rcx, %rax
        leaq    (%rdx,%rax,8), %rax
        ret
.Ltmp0:
        .size   _ZN3Foo7collectEj, .Ltmp0-_ZN3Foo7collectEj


        .section        .note.GNU-stack,"",@progbits

Comment 5 Chris Lattner 2010-12-02 01:13:41 PST


*** This bug has been marked as a duplicate of bug 774 ***