[instcombine comparison] wrong codegen at -O1 (affecting clang trunk, 3.3, and 3.2) #18201

Closed
zhendongsu opened this issue Nov 6, 2013 · 43 comments
Labels
bugzilla Issues migrated from bugzilla

Comments

@zhendongsu

Bugzilla Link 17827
Resolution FIXED
Resolved on Dec 19, 2013 12:39
Version trunk
OS All
CC @hfinkel,@isanbard,@nunoplopes

Extended Description

The following code is miscompiled by the current clang trunk (as well as clang 3.2 and 3.3) on x86_64-linux-gnu at -O1 and above in both 32-bit and 64-bit modes.

It is a regression from clang 3.1.

$ clang-trunk -v
clang version 3.4 (trunk 194075)
Target: x86_64-pc-linux-gnu
Thread model: posix
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.4
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.4.6
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.4.7
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.6
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.6.3
Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.6
$
$ clang-trunk -O0 small.c; a.out
1
$ clang-3.1 -O1 small.c; a.out
1
$
$ clang-trunk -O1 small.c; a.out
0
$ clang-3.3 -O1 small.c; a.out
0
$ clang-3.2 -O1 small.c; a.out
0
$
$ gcc-trunk -O1 small.c; a.out
1
$ icc -O1 small.c; a.out
1
$


int printf (const char *, ...);

struct S
{
  int f0;
  int:1;
  int f1:8;
  int f2;
} a;

int b;

void
foo (int p)
{
  struct S c = a;
  c.f1 = p & 190;
  b = c.f1 < 1;
}

int
main ()
{
  foo (174);
  printf ("%d\n", b);
  return 0;
}

@llvmbot
Collaborator

llvmbot commented Nov 15, 2013

Simplified away extra struct fields, printf, global:

$ cat bitfield.c
struct S
{
  int f0:8;
  int f1:8;
} a;

int foo (int p)
{
  struct S c = a;
  c.f1 = p & 130;
  return c.f1 < 1;
}

int main ()
{
  return foo (128);
}

This still fails with -O1.

@llvmbot
Collaborator

llvmbot commented Nov 15, 2013

Unoptimized IR created by:
$ clang -O0 -fomit-frame-pointer bitfield.c -S -emit-llvm

; ModuleID = 'bitfield.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.0"

%struct.S = type { [2 x i8], [2 x i8] }

@​a = common global %struct.S zeroinitializer, align 4

; Function Attrs: nounwind ssp uwtable
define i32 @​foo(i32 %p) #​0 {
entry:
%p.addr = alloca i32, align 4
%c = alloca %struct.S, align 4
store i32 %p, i32* %p.addr, align 4
%0 = bitcast %struct.S* %c to i8*
call void @​llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.S* @​a, i32 0, i32 0, i32 0), i64 4, i32 4, i1 false)
%1 = load i32* %p.addr, align 4
%and = and i32 %1, 130
%2 = bitcast %struct.S* %c to i16*
%3 = trunc i32 %and to i16
%bf.load = load i16* %2, align 4
%bf.value = and i16 %3, 255
%bf.shl = shl i16 %bf.value, 8
%bf.clear = and i16 %bf.load, 255
%bf.set = or i16 %bf.clear, %bf.shl
store i16 %bf.set, i16* %2, align 4
%bf.result.shl = shl i16 %bf.value, 8
%bf.result.ashr = ashr i16 %bf.result.shl, 8
%bf.result.cast = sext i16 %bf.result.ashr to i32
%4 = bitcast %struct.S* %c to i16*
%bf.load1 = load i16* %4, align 4
%bf.ashr = ashr i16 %bf.load1, 8
%bf.cast = sext i16 %bf.ashr to i32
%cmp = icmp slt i32 %bf.cast, 1
%conv = zext i1 %cmp to i32
ret i32 %conv
}

; Function Attrs: nounwind
declare void @​llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #​1

; Function Attrs: nounwind ssp uwtable
define i32 @​main() #​0 {
entry:
%retval = alloca i32, align 4
store i32 0, i32* %retval
%call = call i32 @​foo(i32 128)
ret i32 %call
}

attributes #​0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #​1 = { nounwind }

!llvm.ident = !{!0}

!​0 = metadata !{metadata !"clang version 3.4 (trunk 194436)"}

@llvmbot
Collaborator

llvmbot commented Nov 15, 2013

Using opt with -O1 on the unoptimized IR:
$ ./opt -O1 bitfield.ll -S -o optbit.ll

Results in:

define i32 @​foo(i32 %p) #​0 {
entry:
%bf.shl = and i32 %p, 130
%cmp = icmp eq i32 %bf.shl, 0
%conv = zext i1 %cmp to i32
ret i32 %conv
}

Something has changed the comparison of '< 1' into a comparison against '0'.

@llvmbot
Collaborator

llvmbot commented Nov 15, 2013

$ ./opt -O1 -debug-pass=Arguments -S < /dev/null
Pass Arguments: -simplifycfg -domtree -domfrontier -mem2reg -instcombine
Pass Arguments: -globalopt -ipsccp -deadargelim -instcombine -simplifycfg -basiccg -prune-eh -inline -functionattrs -domtree -domfrontier -scalarrepl -simplify-libcalls -instcombine -lazy-value-info -jump-threading -simplifycfg -instcombine -tailcallelim -simplifycfg -reassociate -domtree -loops -loopsimplify -lcssa -loop-rotate -licm -lcssa -loop-unswitch -instcombine -scalar-evolution -loopsimplify -lcssa -iv-users -indvars -loop-deletion -instcombine -memdep -memcpyopt -sccp -instcombine -lazy-value-info -jump-threading -correlated-propagation -domtree -memdep -dse -adce -simplifycfg -strip-dead-prototypes -print-used-types -deadtypeelim -preverify -domtree -verify -print-module

So it must be one of those passes...

@llvmbot
Collaborator

llvmbot commented Nov 15, 2013

Even more simplified test case source (doesn't need more than one bitfield, does appear to need the global and a mask constant that doesn't get optimized to something else):

$ cat bitfield.c
struct S
{
  int f0:3;
} a;

int foo (int p)
{
  struct S c = a;
  c.f0 = p & 6;
  return c.f0 < 1;
}

int main ()
{
  return foo (4);
}
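
As a minimal sketch of the expected behavior (assuming the plain-int bitfield is signed, which is how clang treats it), the arithmetic for foo(4) works out like this:

#include <stdio.h>

int main(void)
{
  int masked = 4 & 6;                               /* 4, bit pattern 100 */
  int field = (masked & 4) ? masked - 8 : masked;   /* read back as a signed 3-bit value: -4 */
  printf("field=%d, field<1=%d\n", field, field < 1);   /* prints: field=-4, field<1=1 */
  return 0;
}

So foo(4) should return 1, which matches the -O0 result.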

@llvmbot
Collaborator

llvmbot commented Nov 15, 2013

More simplified source as unoptimized IR:

$ clang bitfield.c -S -emit-llvm

$ cat bitfield.ll
; ModuleID = 'bitfield.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.0"

%struct.S = type { i8, [3 x i8] }

@​a = common global %struct.S zeroinitializer, align 4

; Function Attrs: nounwind ssp uwtable
define i32 @​foo(i32 %p) #​0 {
entry:
%p.addr = alloca i32, align 4
%c = alloca %struct.S, align 4
store i32 %p, i32* %p.addr, align 4
%0 = bitcast %struct.S* %c to i8*
call void @​llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.S* @​a, i32 0, i32 0), i64 4, i32 4, i1 false)
%1 = load i32* %p.addr, align 4
%and = and i32 %1, 6
%2 = bitcast %struct.S* %c to i8*
%3 = trunc i32 %and to i8
%bf.load = load i8* %2, align 4
%bf.value = and i8 %3, 7
%bf.clear = and i8 %bf.load, -8
%bf.set = or i8 %bf.clear, %bf.value
store i8 %bf.set, i8* %2, align 4
%bf.result.shl = shl i8 %bf.value, 5
%bf.result.ashr = ashr i8 %bf.result.shl, 5
%bf.result.cast = sext i8 %bf.result.ashr to i32
%4 = bitcast %struct.S* %c to i8*
%bf.load1 = load i8* %4, align 4
%bf.shl = shl i8 %bf.load1, 5
%bf.ashr = ashr i8 %bf.shl, 5
%bf.cast = sext i8 %bf.ashr to i32
%cmp = icmp slt i32 %bf.cast, 1
%conv = zext i1 %cmp to i32
ret i32 %conv
}

; Function Attrs: nounwind
declare void @​llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #​1

; Function Attrs: nounwind ssp uwtable
define i32 @​main() #​0 {
entry:
%retval = alloca i32, align 4
store i32 0, i32* %retval
%call = call i32 @​foo(i32 4)
ret i32 %call
}

attributes #​0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #​1 = { nounwind }

!llvm.ident = !{!0}

!​0 = metadata !{metadata !"clang version 3.4 (trunk 194436)"}

@llvmbot
Collaborator

llvmbot commented Nov 15, 2013

And bug repros with just 2 optimization passes:

$ opt -sroa -instcombine bitfield.ll -S -o bitfieldbug.ll
$ clang bitfieldbug.ll ; ./a.out ; echo $?
0

Optimized IR in this case is:

$ cat bitfieldbug.ll
...
define i32 @​foo(i32 %p) #​0 {
entry:
%bf.shl = and i32 %p, 6
%cmp = icmp eq i32 %bf.shl, 0
%conv = zext i1 %cmp to i32
ret i32 %conv
}
...

@llvmbot
Collaborator

llvmbot commented Nov 15, 2013

-debug output shows:
INSTCOMBINE ITERATION #​1 on foo
...
IC: Old = %cmp = icmp slt i8 %bf.shl, 1
New = = icmp eq i8 %bf.shl, 0

So the comparison of "<1" is being mistakenly transformed into "==0" here: a signed value is being treated as if it were unsigned.

@llvmbot
Collaborator

llvmbot commented Nov 16, 2013

Hmmm...given the IR, I think the instcombine optimizer is actually doing the right thing here.

It recognizes that for an 8-bit signed value (field f0), "p & 6" can never be negative, so for the "less than 1" comparison the only remaining value is 0.

So now the question is:
Why is this struct with a 3-bit bitfield:
struct S
{
int f0:3;
} a;

Defined like this:
%struct.S = type { i8, [3 x i8] }

Unless I'm missing something, the IR has lost a fundamental piece of information from the original source: there's no way to know that the 3rd bit of f0 is the sign bit now.

@llvmbot
Collaborator

llvmbot commented Nov 17, 2013

I started a thread on llvm-dev to discuss this:
http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-November/067814.html

The original test case (and my simplified test cases) could be resolved as invalid because they do not specify signed/unsigned for the bitfield(s). In that case (and I think this is crazy), the C standard says the signedness is implementation-defined, and C++ has only recently fixed this:

http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#739

Thank you to Mark Lacey and Richard Smith for pointing this out.

However, there is still a bug in LLVM, because clang's implementation behaves as we expected: a plain 'int' bitfield is treated as 'signed int'.

Henrique Santos noted that the unoptimized IR does preserve the sign bit with all of those shift left/right ops. So there is a bug in InstCombine.

I'm still not sure what advantage the front-end / clang gains by treating bitfields as 8-bit operands ('i8') rather than just using the actual number of bits, e.g. 'i3'. All that shifting can lead to other codegen problems, as seen in bug 17956.
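
A minimal sketch of that sign preservation, modeling the i8 math with signed char (the narrowing conversion and the right shift on a negative value are implementation-defined in C, but match the IR's i8 semantics on typical targets):

#include <stdio.h>

int main(void)
{
  signed char stored = 4 & 6;                      /* low 3 bits hold 100 */
  signed char wide = (signed char)(stored << 5);   /* like 'shl i8 ..., 5' -> 0x80 */
  signed char field = wide >> 5;                   /* like 'ashr i8 ..., 5' -> -4 */
  printf("%d\n", field);                           /* prints -4 */
  return 0;
}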

@llvmbot
Collaborator

llvmbot commented Nov 20, 2013

This is the IR after sroa and heading into instcombine:

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.0"

%struct.S = type { i8, [3 x i8] }

@​a = common global %struct.S zeroinitializer, align 4

define i32 @​foo(i32 %p) #​0 {
entry:
%c.sroa.4 = alloca [3 x i8]
%c.sroa.0.0.copyload = load i8* getelementptr inbounds (%struct.S* @​a, i64 0, i32 0), align 4
%c.sroa.4.0.idx = getelementptr inbounds [3 x i8]* %c.sroa.4, i64 0, i64 0
call void @​llvm.memcpy.p0i8.p0i8.i64(i8* %c.sroa.4.0.idx, i8* getelementptr inbounds (%struct.S* @​a, i64 0, i32 1, i64 0), i64 3, i32 1, i1 false)
%and = and i32 %p, 6
%0 = trunc i32 %and to i8
%bf.value = and i8 %0, 7
%bf.clear = and i8 %c.sroa.0.0.copyload, -8
%bf.set = or i8 %bf.clear, %bf.value
%bf.result.shl = shl i8 %bf.value, 5
%bf.result.ashr = ashr i8 %bf.result.shl, 5
%bf.shl = shl i8 %bf.set, 5
%bf.ashr = ashr i8 %bf.shl, 5
%bf.cast = sext i8 %bf.ashr to i32
%cmp = icmp slt i32 %bf.cast, 1
%conv = zext i1 %cmp to i32
ret i32 %conv
}

; Function Attrs: nounwind
declare void @​llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #​1

@llvmbot
Collaborator

llvmbot commented Nov 20, 2013

Further simplified IR that still fails in instcombine (don't need any globals/structs):

define i1 @​foo(i8 %p, i8 %q) #​0 {
entry:
%andp = and i8 %p, 6
%andq = and i8 %q, 8
%or = or i8 %andq, %andp
%shl = shl i8 %or, 5
%ashr = ashr i8 %shl, 5
%cmp = icmp slt i8 %ashr, 1
ret i1 %cmp
}

Note: the shifts appear to be impossible to generate with straight C code that just uses 'char' because of C's integer promotion rules.

@llvmbot
Collaborator

llvmbot commented Nov 20, 2013

Finally, based on the debug output, here's the most simplified code before the incorrect optimization takes place:

define i1 @​foo(i8 %p) #​0 {
entry:
%shlp = shl i8 %p, 5
%andp = and i8 %shlp, -64
%cmp = icmp slt i8 %andp, 32
ret i1 %cmp
}
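
A minimal C model of this IR and of the fold instcombine applies to it (i8 modeled with signed char; the narrowing conversion is implementation-defined in C but matches i8 truncation on typical targets) shows the two sides disagree for the value that reaches this code, p = 4:

#include <stdio.h>

int main(void)
{
  signed char p = 4;
  /* Original:  ((p << 5) & -64) s< 32 */
  signed char shlp = (signed char)(p << 5);   /* 0x80 = -128 */
  signed char andp = shlp & -64;              /* still -128 */
  int before = andp < 32;                     /* 1: the correct answer */
  /* Folded:    (p & 6) s< 1, i.e. (p & 6) == 0 */
  int after = (p & 6) < 1;                    /* 0: the miscompiled answer */
  printf("before=%d after=%d\n", before, after);
  return 0;
}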

@llvmbot
Collaborator

llvmbot commented Nov 21, 2013

Henrique suggested that this particular optimization:
((x shl 5) and -64) slt 32 -> (x and -64) slt 1

or perhaps more easily readable by humans:
((x mul 32) and 0xC0) slt 32 -> (x and 0xC0) slt 1

should only be done if the shift instruction has an 'nsw' flag or if the comparison was unsigned (ult).

Adding those constraints in InstCombiner::visitICmpInstWithInstAndIntCst() via:

    bool CanFold = Shift->isLogicalShift() &&
                   (Shift->hasNoSignedWrap() || !ICI.isSigned());

Causes 6 existing test failures.

This one file dealing with comparison optimizations is over 3K lines of code that only a compiler person could love...there will be bugs.

@llvmbot
Collaborator

llvmbot commented Nov 21, 2013

Henrique suggested that this particular optimization:
((x shl 5) and -64) slt 32 -> (x and -64) slt 1

or perhaps more easily readable by humans:
((x mul 32) and 0xC0) slt 32 -> (x and 0xC0) slt 1

Forgot to shift the mask value:
((x mul 32) and 0xC0) slt 32 -> (x and 0x06) slt 1

@llvmbot
Collaborator

llvmbot commented Nov 21, 2013

For reference, here's what broke with that one line change:

$ llvm-lit ../llvm/test/Transforms/InstCombine/
...
Failing Tests (6):
LLVM :: Transforms/InstCombine/2006-09-15-CastToBool.ll
LLVM :: Transforms/InstCombine/2007-03-25-BadShiftMask.ll
LLVM :: Transforms/InstCombine/apint-shift.ll
LLVM :: Transforms/InstCombine/icmp.ll
LLVM :: Transforms/InstCombine/set.ll
LLVM :: Transforms/InstCombine/shift.ll

@llvmbot
Collaborator

llvmbot commented Nov 21, 2013

LLVM :: Transforms/InstCombine/2006-09-15-CastToBool.ll

This is based on bug 913.

LLVM :: Transforms/InstCombine/2007-03-25-BadShiftMask.ll

This is based on bug 1271.

@llvmbot
Collaborator

llvmbot commented Nov 21, 2013

Phew...all of the failures were due to the fact that (strangely) you can't ask a logical right shift whether it hasNoSignedWrap()...you'd think a BinaryOperator that isn't an OverflowingBinaryOperator would just return false for that, but no, it asserts on the cast.

By checking for left shift before asking about the signed wrap, all tests are passing again.

It is frightening how little logic some of these tests actually confirm and how ad-hoc the testing is...

@llvmbot
Collaborator

llvmbot commented Nov 21, 2013

Patch submitted for review.

@llvmbot
Collaborator

llvmbot commented Nov 25, 2013

Evan Cheng looked at the patch and this bug report (thanks!) and suggested:
"Perhaps you can use Nuno's technique to verify the optimization with a SMT solver?"

This is the first I've heard of this work - I found:
http://web.ist.utl.pt/nuno.lopes/pubs/cork-spin13.pdf

And I see there was a presentation at an LLVM Dev Mtg:
http://llvm.org/devmtg/2013-11/#talk5

I haven't located any slides from that talk though.

Is there any testing machinery already in LLVM that I can do this with?

cc'ing Evan (I missed your reply on the commits list until now - too much traffic!) and Nuno.

@nunoplopes
Member

Evan Cheng looked at the patch and this bug report (thanks!) and suggested:
"Perhaps you can use Nuno's technique to verify the optimization with a SMT
solver?"

Please ignore that paper on CORK for now, since it's theoretical work.
The slides from my talk can be found here: http://web.ist.utl.pt/~nuno.lopes/pres/llvm-devconf13.pptx

Currently there's no machinery, so the translation to SMT has to be done by hand. I did a simplified version of the optimization for Shl and LShr, which are the cases you are trying to change.

I couldn't find any bug in the mentioned instcombine transformation. You can find the proof here:
http://rise4fun.com/Z3/IGjx

Unless I've missed some detail of the code, Z3 says it is correct and no patch is needed!

@llvmbot
Collaborator

llvmbot commented Nov 26, 2013

Thanks, Nuno! That is very good to find the Z3 site. I hope we can trust it. :)

I think you must specify 'less than' as the bit vector comparator rather than 'equals'.

This is my first try at using this language, but here's my translation of the shift-left case:
http://rise4fun.com/Z3/wqUA

Just in case that link disappears:
(declare-fun X () (_ BitVec 3))
(declare-fun Shift () (_ BitVec 3))
(declare-fun Mask () (_ BitVec 3))
(declare-fun Y () (_ BitVec 3))

; ShiftOpcode == Instruction::Shl
(assert (not (=
(ite (= (bvshl (bvlshr Y Shift) Shift) Y)
; ((X << Shift) & Mask) < Y
(bvslt (bvand (bvshl X Shift) Mask) Y)
true)

(ite (= (bvshl (bvlshr Y Shift) Shift) Y)
; (X & (Mask >> Shift)) < (Y >> Shift)
(bvslt (bvand X (bvlshr Mask Shift)) (bvlshr Y Shift))
true)
)))
(check-sat)
(get-model)

@nunoplopes
Member

Thanks for your review!
Now I see that you were referring to the signed comparisons.

I did some tests for signed < ("(X << C1) & C2 s< C3") and the results are:

  1. If C2 and C3 don't have the sign bit set, then the transformation is correct
  2. Assuming that left shift overflow is undefined isn't sufficient to prove correctness
  3. Doing the transformation when the shift is NSW is not correct. Counterexample (4 bits):

X = #x0
C3 = #x8
C2 = #x0
C1 = #x3

So it seems that the correct fix is 1) and not 3).

Feel free to play with the tests: http://rise4fun.com/Z3/plN5x
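
Evaluating that counterexample by hand, with 4-bit values sign-extended manually, shows the two sides disagree even though the shift does not overflow (so nsw alone cannot justify the fold); a minimal sketch:

#include <stdio.h>

/* Interpret the low 4 bits of v as a signed 4-bit value. */
static int s4(int v) { v &= 0xF; return (v & 0x8) ? v - 16 : v; }

int main(void)
{
  int X = 0x0, C1 = 0x3, C2 = 0x0, C3 = 0x8;
  /* Original:    ((X << C1) & C2) s< C3           -> 0 s< -8 -> false */
  int before = s4((X << C1) & C2) < s4(C3);
  /* Transformed: (X & (C2 u>> C1)) s< (C3 u>> C1) -> 0 s< 1  -> true  */
  int after = s4(X & (C2 >> C1)) < s4(C3 >> C1);
  printf("before=%d after=%d\n", before, after);   /* before=0 after=1 */
  return 0;
}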

@llvmbot
Collaborator

llvmbot commented Nov 26, 2013

Here's my implementation of the 3 left-shift cases mentioned so far:

  1. Plain left-shift with signed comparison (this is the original C test case condition, so we know this is illegal)

  2. nsw left-shift with signed comparison

  3. left-shift with unsigned comparison

Note that I've reduced the data types to 2-bit vectors because I'm not sure how to code 'shl nsw' in LISP otherwise. :)

http://rise4fun.com/Z3/1Uwh

@llvmbot
Collaborator

llvmbot commented Nov 26, 2013

If I've coded it correctly, Z3 proves that case #​2 (nsw) is not a safe optimization. This is the same conclusion that Nuno just reached.

@nunoplopes
Member

If I've coded it correctly, Z3 proves that case #​2 (nsw) is not a safe
optimization. This is the same conclusion that Nuno just reached.

Cool!
I'm pretty happy that you managed to work with Z3 :)

In summary, the conclusion is that both the current code and your patch are not correct.
My proposal is that we should only do the transformation when C2 and C3 don't have the sign bit set (if doing a signed comparison).

Feel free to propose a new patch. I'll be glad to review it.

@llvmbot
Collaborator

llvmbot commented Nov 28, 2013

Thanks, Nuno. I'm having fun with Z3. :)

I decided to code up the full set (30!) of comparisons and shifts (because that's what the code claims to handle) with the only constraint that no bits are lost when shifting the comparison value:

http://rise4fun.com/Z3/PHzD

From that result, I think we see that if either the shift is signed (arithmetic shift right) or the comparison is signed, then the transformation is SAT (i.e., a counterexample exists, so it is illegal).

I didn't add the sign bit constraint for C2 and C3 that you found yet. How did you determine that constraint? I guess the right thing to do is check various permutations of that kind of constraint until all tests pass?

@nunoplopes
Member

Thanks, Nuno. I'm having fun with Z3. :)

I decided to code up the full set (30!) of comparisons and shifts (because
that's what the code claims to handle) with the only constraint that no bits
are lost when shifting the comparison value:

http://rise4fun.com/Z3/PHzD

Cool! :)

From that result, I think we see that if either the shift is signed
(arithmetic shift right) or the comparison is signed, then the
transformation is SAT (ie, illegal).

I didn't add the sign bit constraint for C2 and C3 that you found yet. How
did you determine that constraint? I guess the right thing to do is check
various permutations of that kind of constraint until all tests pass?

It was a guess. From past experience, it seemed like a good candidate. Then I used Z3 to prove that it is actually correct.
Of course, it doesn't mean it's the weakest precondition for the transformation.

My suggestion is that we first fix the current code, so that the fix goes into LLVM 3.4. Then we can generalize the patch if needed/possible.

BTW, we do have an algorithm to generate these conditions automatically. But we have only implemented it for a simple IR, not yet for LLVM's IR.

@llvmbot
Collaborator

llvmbot commented Nov 28, 2013

Hi Nuno,
I agree that we should correct the bug first, and then relax the constraints as much as possible to allow more optimization.

To that end, here is the simplest patch that I can think of to correct the codegen for 3.4 - it just checks the comparison to make sure it is not signed in the case of the logical shifts. I think the existing constraint on the arithmetic shift right case is safe, but I have not proven it yet:

Index: lib/Transforms/InstCombine/InstCombineCompares.cpp

--- lib/Transforms/InstCombine/InstCombineCompares.cpp (revision 195371)
+++ lib/Transforms/InstCombine/InstCombineCompares.cpp (working copy)
@@ -1198,11 +1198,16 @@
       Type *AndTy = AndCST->getType();          // Type of the and.
 
       // We can fold this as long as we can't shift unknown bits
-      // into the mask.  This can only happen with signed shift
-      // rights, as they sign-extend.
+      // into the mask. This can happen with signed shift
+      // rights, as they sign-extend. With logical shifts,
+      // we must still make sure the comparison is not signed
+      // because we are effectively changing the
+      // position of the sign bit (#17827).
+      // TODO: We can relax these constraints a bit more.
       if (ShAmt) {
-        bool CanFold = Shift->isLogicalShift();
-        if (!CanFold) {
+        bool CanFold = false;
+        unsigned ShiftOpcode = Shift->getOpcode();
+        if (ShiftOpcode == Instruction::AShr) {
           // To test for the bad case of the signed shr, see if any
           // of the bits shifted in could be tested after the mask.
           uint32_t TyBits = Ty->getPrimitiveSizeInBits();

@@ -1212,6 +1217,9 @@
           if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) &
                AndCST->getValue()) == 0)
             CanFold = true;
+        } else if (ShiftOpcode == Instruction::Shl ||
+                   ShiftOpcode == Instruction::LShr) {
+          CanFold = !ICI.isSigned();
         }
 
         if (CanFold) {
    

@llvmbot
Collaborator

llvmbot commented Nov 28, 2013

Created attachment 11626 [details]
simplest patch to correct bug for 3.4 release
This patch adds the constraint that we should not alter the comparison for signed comparisons. It fixes the test cases in this bug report.

Some test cases included, but they are not thorough.

If anyone would like to review and commit this patch, I would be most appreciative. I will not be able to commit it myself for at least the next 3 days.

@nunoplopes
Member

Created attachment 11626 [details]
simplest patch to correct bug for 3.4 release

This patch adds the constraint that we should not alter the comparison for
signed comparisons. It fixes the test cases in this bug report.

Some test cases included, but they are not thorough.

If anyone would like to review and commit this patch, I would be most
appreciative. I will not be able to commit it myself for at least the next 3
days.

The patch LGTM. Please commit it when you are available again. Thanks!

@llvmbot
Collaborator

llvmbot commented Dec 2, 2013

Thanks, Nuno. Committed as r196129.

@llvmbot
Collaborator

llvmbot commented Dec 2, 2013

Just checking the sign bits of C2 and C3 does not appear to be sufficient for a right-shift (either arithmetic or logical):

  1. u>> slt sat
    (model
    (define-fun X () (_ BitVec 8) #x80)
    (define-fun C3 () (_ BitVec 8) #x00)
    (define-fun C2 () (_ BitVec 8) #x60)
    (define-fun C1 () (_ BitVec 8) #x02) )

http://rise4fun.com/Z3/9tj4
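
Evaluating that model by hand (8-bit values sign-extended manually) confirms the disagreement for the logical-right-shift fold when only the unshifted sign bits of C2 and C3 are checked; a minimal sketch:

#include <stdio.h>

/* Interpret the low 8 bits of v as a signed 8-bit value. */
static int s8(int v) { v &= 0xFF; return (v & 0x80) ? v - 256 : v; }

int main(void)
{
  int X = 0x80, C1 = 0x02, C2 = 0x60, C3 = 0x00;
  /* Original:    ((X u>> C1) & C2) s< C3        -> 0x20 s< 0 -> false */
  int before = s8((X >> C1) & C2) < s8(C3);
  /* Transformed: (X & (C2 << C1)) s< (C3 << C1) -> 0x80 s< 0 -> true  */
  int after = s8(X & (C2 << C1)) < s8(C3 << C1);
  printf("before=%d after=%d\n", before, after);   /* before=0 after=1 */
  return 0;
}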

@nunoplopes
Member

Just checking the sign bits of C2 and C3 does not appear to be sufficient
for a right-shift (either arithmetic or logical):

  1. u>> slt sat
    (model
    (define-fun X () (_ BitVec 8) #x80)
    (define-fun C3 () (_ BitVec 8) #x00)
    (define-fun C2 () (_ BitVec 8) #x60)
    (define-fun C1 () (_ BitVec 8) #x02) )

http://rise4fun.com/Z3/9tj4

You're right.

2nd proposal, perform transformation only if:
(or
(bvslt C3 #x00)
(and
(bvsge (bvshl C3 C1) #x00)
(bvsge (bvshl C2 C1) #x00)
)
)

i.e., we don't shift a 1 into the sign bit, unless C3 was already negative.

@llvmbot
Collaborator

llvmbot commented Dec 4, 2013

The existing check that allows the optimization on an arithmetic shift right is insufficient: it only checks whether any of the bits shifted in overlap with the mask.

Here's a test case that proves it:

; Function Attrs: nounwind readnone ssp uwtable
define i1 @​foo(i8 %p) #​0 {
entry:
%shlp = ashr i8 %p, 1
%andp = and i8 %shlp, 64
%cmp = icmp slt i8 %andp, -63
ret i1 %cmp
}

; Function Attrs: nounwind readnone ssp uwtable
define i1 @​main() #​0 {
entry:
%call = tail call i1 @​foo(i8 128)
ret i1 %call
}

...

Or in Z3 form - http://rise4fun.com/Z3/5FiO :
15. s>> slt
sat (model
(define-fun X () (_ BitVec 8) #x80)
(define-fun C3 () (_ BitVec 8) #xc1)
(define-fun C2 () (_ BitVec 8) #x40)
(define-fun C1 () (_ BitVec 8) #x01)
)

...

However, it appears that there is no way to expose this bug in actual codegen because earlier optimizations in InstCombine such as SimplifyICmpInst() always optimize the IR before we get to the shift+and+cmp chunk of code.

@nunoplopes
Member

Ok, so I give up trying to weaken it by hand.

The only safe thing I can come up with is:
(and (bvsge (bvshl C3 C1) #x00)
(bvsge (bvshl C2 C1) #x00)
)

@llvmbot
Collaborator

llvmbot commented Dec 5, 2013

Ok, so I give up trying to weaken it by hand.

The only safe thing I can come up with is:
(and (bvsge (bvshl C3 C1) #x00)
(bvsge (bvshl C2 C1) #x00)
)

Heh...yeah, I was experimenting by hand too, and I've been wondering if this is worth so much effort...we're just getting rid of 1 shift in some obscure cases now!

I'll try to code this up in C and propose a patch.

@llvmbot
Collaborator

llvmbot commented Dec 6, 2013

relax previous patch to allow more optimization and fix AShr
Here's a patch that (I hope) implements the constraints specified by Nuno for the left shift and logical right shift cases.

I have not found any reasonable constraint to make the arithmetic right shift case work, so I've just changed that to not fold. Based on my experiments, it is very likely that a previous optimization has simplified AShr into LShr or removed it entirely, so this should not be a significant loss.

Index: lib/Transforms/InstCombine/InstCombineCompares.cpp

--- lib/Transforms/InstCombine/InstCombineCompares.cpp (revision 196164)
+++ lib/Transforms/InstCombine/InstCombineCompares.cpp (working copy)
@@ -1195,33 +1195,42 @@
     ConstantInt *ShAmt;
     ShAmt = Shift ? dyn_cast<ConstantInt>(Shift->getOperand(1)) : 0;
 
-    // We can fold this as long as we can't shift unknown bits
-    // into the mask. This can happen with signed shift
-    // rights, as they sign-extend. With logical shifts,
-    // we must still make sure the comparison is not signed
-    // because we are effectively changing the
-    // position of the sign bit (#17827).
-    // TODO: We can relax these constraints a bit more.
+    // This seemingly simple opportunity to fold away a shift turns out to
+    // be rather complicated. See llvm/llvm-project#18201 for details.
     if (ShAmt) {
       bool CanFold = false;
       unsigned ShiftOpcode = Shift->getOpcode();
       if (ShiftOpcode == Instruction::AShr) {
-        // To test for the bad case of the signed shr, see if any
-        // of the bits shifted in could be tested after the mask.
-        Type *ShiftType = Shift->getType();
-        Type *AndType = AndCst->getType();
-        unsigned ShiftBitWidth = ShiftType->getPrimitiveSizeInBits();
-        unsigned AndBitWidth = AndType->getPrimitiveSizeInBits();
-        int ShAmtVal = ShiftBitWidth - ShAmt->getLimitedValue(ShiftBitWidth);
-        if ((APInt::getHighBitsSet(AndBitWidth, AndBitWidth - ShAmtVal) &
-             AndCst->getValue()) == 0)
+        // There may be some constraints that make this possible, but it is
+        // not obvious what those constraints are.
+        CanFold = false;
+      } else if (ShiftOpcode == Instruction::Shl) {
+        // For a left shift, we can fold if the comparison is not signed.
+        // We can also fold a signed comparison if the mask value and
+        // comparison value are not negative. These constraints are not
+        // obvious, but we can prove that they are correct using an SMT
+        // solver such as "Z3" :
+        // http://rise4fun.com/Z3/DyMp
+        if (!ICI.isSigned() || (!AndCst->isNegative() && !RHS->isNegative()))
           CanFold = true;
-      } else if (ShiftOpcode == Instruction::Shl ||
-                 ShiftOpcode == Instruction::LShr) {
-        CanFold = !ICI.isSigned();
+      } else if (ShiftOpcode == Instruction::LShr) {
+        // For a logical right shift, we can fold if the comparison is not
+        // signed. We can also fold a signed comparison if the shifted mask
+        // value and the shifted comparison value are not negative.
+        // These constraints are not obvious, but we can prove that they are
+        // correct using an SMT solver such as "Z3" :
+        // http://rise4fun.com/Z3/Tslfh
+        if (!ICI.isSigned())
+          CanFold = true;
+        else {
+          ConstantInt *ShiftedAndCst =
+            cast<ConstantInt>(ConstantExpr::getShl(AndCst, ShAmt));
+          ConstantInt *ShiftedRHSCst =
+            cast<ConstantInt>(ConstantExpr::getShl(RHS, ShAmt));
+          if (!ShiftedAndCst->isNegative() && !ShiftedRHSCst->isNegative())
+            CanFold = true;
+        }
       }
 
       if (CanFold) {
    

@nunoplopes
Member

Sorry for the delay.
The patch LGTM; please commit.

I assume you encoded the current Ashr implementation and you found a bug, right?

@llvmbot
Collaborator

llvmbot commented Dec 16, 2013

Sorry for the delay.
The patch LGTM; please commit.

I assume you encoded the current Ashr implementation and you found a bug,
right?

Thanks, Nuno. Yes, I think the Ashr case is incorrect. Please see comment 35 for example in IR and Z3.

@nunoplopes
Member

Sorry for the delay.
The patch LGTM; please commit.

I assume you encoded the current Ashr implementation and you found a bug,
right?

Thanks, Nuno. Yes, I think the Ashr case is incorrect. Please see comment 35
for example in IR and Z3.

Ok, please commit the patch and close this bug report. Thanks!

@llvmbot
Collaborator

llvmbot commented Dec 19, 2013

Patch committed in r197705.

Author: kkhoo
Date: Thu Dec 19 12:07:17 2013
New Revision: 197705

URL: http://llvm.org/viewvc/llvm-project?rev=197705&view=rev
Log:
Improved fix for #18201 (instcombine of shift/and/compare).

This change fixes the case of arithmetic shift right - do not attempt to fold that case.
This change also relaxes the conditions when attempting to fold the logical shift right and shift left cases.

No additional IR-level test cases included at this time. See http://llvm.org/bugs/show_bug.cgi?id=17827 for proofs that these are correct transformations.

@llvmbot
Collaborator

llvmbot commented Dec 19, 2013

Hal correctly points out that some LLVM customers may be on questionable legal ground if links to Z3 are included in the LLVM codebase (because of its terms of use), and also that those links may not last, so I have removed the links in r197713.

Here are my code examples for the left shift and logical right shift cases that were referenced in those comments:

; We want to optimize an expression of the form:
; (X >> C1) & C2 != C3 (where any shift and any compare could exist)

; We need to check 10 forms of compares:
; signed less-than, signed less-than-or-equal, signed greater-than, signed greater-than-or-equal
; unsigned less-than, unsigned less-than-or-equal, unsigned greater-than, unsigned greater-than-or-equal
; equal, not-equal

(declare-fun X () (_ BitVec 8))
(declare-fun C1 () (_ BitVec 8)) ; shift value
(declare-fun C2 () (_ BitVec 8)) ; mask value
(declare-fun C3 () (_ BitVec 8)) ; comparison value

(push) ; LEFT-SHIFT
(assert (and
(bvsge C2 #x00)
(bvsge C3 #x00)
(= (bvshl (bvlshr C3 C1) C1) C3) ; must not lose any bits when shifting comparison value
; (bvult C1 8) ; restrict to defined behavior - shift value can not be greater than vector size
))
(push)
(echo "1. << ult")
(assert (not (=
; (X << C1) & C2 s< C3
(bvult (bvand (bvshl X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvult (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "2. << ule")
(assert (not (=
; (X << C1) & C2 s< C3
(bvule (bvand (bvshl X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvule (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "3. << ugt")
(assert (not (=
; (X << C1) & C2 s< C3
(bvugt (bvand (bvshl X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvugt (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "4. << uge")
(assert (not (=
; (X << C1) & C2 s< C3
(bvuge (bvand (bvshl X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvuge (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "5. << slt")
(assert (not (=
; (X << C1) & C2 s< C3
(bvslt (bvand (bvshl X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvslt (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "6. << sle")
(assert (not (=
; (X << C1) & C2 s< C3
(bvsle (bvand (bvshl X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvsle (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "7. << sgt")
(assert (not (=
; (X << C1) & C2 s< C3
(bvsgt (bvand (bvshl X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvsgt (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "8. << sge")
(assert (not (=
; (X << C1) & C2 s< C3
(bvsge (bvand (bvshl X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvsge (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "9. << eq")
(assert (not (=
; (X << C1) & C2 s< C3
(= (bvand (bvshl X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(= (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "10. << not eq")
(assert (not (=
; (X << C1) & C2 s< C3
(not (= (bvand (bvshl X C1) C2) C3))

; (X & (C2 >> C1)) s< (C3 >> C1)
(not (= (bvand X (bvlshr C2 C1)) (bvlshr C3 C1)))
)))
(check-sat)
;(get-model)
(pop)

(pop) ; left-shift

; We want to optimize an expression of the form:
; (X >> C1) & C2 != C3 (where any shift and any compare could exist)

; We also need to check 10 forms of compares:
; signed less-than, signed less-than-or-equal, signed greater-than, signed greater-than-or-equal
; unsigned less-than, unsigned less-than-or-equal, unsigned greater-than, unsigned greater-than-or-equal
; equal, not-equal

(declare-fun X () (_ BitVec 8))
(declare-fun C1 () (_ BitVec 8)) ; shift value
(declare-fun C2 () (_ BitVec 8)) ; mask value
(declare-fun C3 () (_ BitVec 8)) ; comparison value

(push) ; LOGICAL RIGHT SHIFT
(assert (and
(and (bvsge (bvshl C3 C1) #x00)
(bvsge (bvshl C2 C1) #x00)
)
;(bvsge C2 #x00)
;(bvsge C3 #x00)
(= (bvlshr (bvshl C3 C1) C1) C3) ; must not lose any bits when shifting comparison value
; (bvult C1 4) ; restrict to defined behavior - shift value can not be greater than vector size
))

(push)
(echo "21. u>> ult")
(assert (not (=
; (X u>> C1) & C2 s< C3
(bvult (bvand (bvlshr X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvult (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "22. u>> ule")
(assert (not (=
; (X u>> C1) & C2 s< C3
(bvule (bvand (bvlshr X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvule (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))

(check-sat)
;(get-model)
(pop)

(push)
(echo "23. u>> ugt")
(assert (not (=
; (X u>> C1) & C2 s< C3
(bvugt (bvand (bvlshr X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvugt (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "24. u>> uge")
(assert (not (=
; (X u>> C1) & C2 s< C3
(bvuge (bvand (bvlshr X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvuge (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "25. u>> slt")
(assert (not (=
; (X u>> C1) & C2 s< C3
(bvslt (bvand (bvlshr X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvslt (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "26. u>> sle")
(assert (not (=
; (X u>> C1) & C2 s< C3
(bvsle (bvand (bvlshr X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvsle (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))

(check-sat)
;(get-model)
(pop)

(push)
(echo "27. u>> sgt")
(assert (not (=
; (X u>> C1) & C2 s< C3
(bvsgt (bvand (bvlshr X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvsgt (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "28. u>> sge")
(assert (not (=
; (X u>> C1) & C2 s< C3
(bvsge (bvand (bvlshr X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(bvsge (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "29. u>> eq")
(assert (not (=
; (X u>> C1) & C2 s< C3
(= (bvand (bvlshr X C1) C2) C3)

; (X & (C2 >> C1)) s< (C3 >> C1)
(= (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))
(check-sat)
(pop)

(push)
(echo "30. u>> not eq")
(assert (not (=
; (X u>> C1) & C2 s< C3
(not (= (bvand (bvlshr X C1) C2) C3))

; (X & (C2 >> C1)) s< (C3 >> C1)
(not (= (bvand X (bvshl C2 C1)) (bvshl C3 C1)))
)))
(check-sat)
(pop)

(pop) ; right-shift
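
For anyone without an SMT solver handy, a brute-force sketch in C can cross-check the logical-right-shift cases above. This one checks only the slt form, under the same preconditions as the script (shifted mask and comparison values nonnegative, and no bits of the comparison value lost by the shift); the other comparisons can be checked the same way. It should report no mismatches.

#include <stdio.h>

/* Interpret the low 8 bits of v as a signed 8-bit value. */
static int s8(int v) { v &= 0xFF; return (v & 0x80) ? v - 256 : v; }

int main(void)
{
  long mismatches = 0;
  for (int C1 = 0; C1 < 8; ++C1)
    for (int C2 = 0; C2 < 256; ++C2)
      for (int C3 = 0; C3 < 256; ++C3) {
        int sC2 = (C2 << C1) & 0xFF;                 /* bvshl C2 C1 */
        int sC3 = (C3 << C1) & 0xFF;                 /* bvshl C3 C1 */
        /* Preconditions from the script above. */
        if ((sC2 & 0x80) || (sC3 & 0x80)) continue;  /* shifted sign bit set */
        if ((sC3 >> C1) != C3) continue;             /* bits of C3 lost */
        for (int X = 0; X < 256; ++X) {
          /* Original:    ((X u>> C1) & C2) s< C3 */
          int before = s8((X >> C1) & C2) < s8(C3);
          /* Transformed: (X & (C2 << C1)) s< (C3 << C1) */
          int after = s8(X & sC2) < s8(sC3);
          if (before != after) ++mismatches;
        }
      }
  printf("mismatches=%ld\n", mismatches);   /* expect 0 */
  return 0;
}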

@llvmbot llvmbot transferred this issue from llvm/llvm-bugzilla-archive Dec 9, 2021
This issue was closed.