LLVM Bugzilla is read-only and represents the historical archive of all LLVM issues filed before November 26, 2021. Use GitHub to submit new LLVM bugs.

Bug 17827 - [instcombine comparison] wrong codegen at -O1 (affecting clang trunk, 3.3, and 3.2)
Status: RESOLVED FIXED
Alias: None
Product: libraries
Classification: Unclassified
Component: Scalar Optimizations
Version: trunk
Hardware: PC All
Importance: P normal
Assignee: Unassigned LLVM Bugs
 
Reported: 2013-11-05 20:28 PST by Zhendong Su
Modified: 2013-12-19 12:39 PST
CC List: 7 users



Attachments
simplest patch to correct bug for 3.4 release (4.09 KB, application/octet-stream)
2013-11-28 13:06 PST, Kay Tiong Khoo
relax previous patch to allow more optimization and fix AShr (3.38 KB, patch)
2013-12-05 16:44 PST, Kay Tiong Khoo

Description Zhendong Su 2013-11-05 20:28:51 PST
The following code is miscompiled by the current clang trunk (as well as clang 3.2 and 3.3) on x86_64-linux-gnu at -O1 and above in both 32-bit and 64-bit modes.  

It is a regression from clang 3.1. 

$ clang-trunk -v
clang version 3.4 (trunk 194075)
Target: x86_64-pc-linux-gnu
Thread model: posix
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.4
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.4.6
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.4.7
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.6
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.6.3
Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.6
$ 
$ clang-trunk -O0 small.c; a.out
1
$ clang-3.1 -O1 small.c; a.out
1
$         
$ clang-trunk -O1 small.c; a.out
0
$ clang-3.3 -O1 small.c; a.out
0
$ clang-3.2 -O1 small.c; a.out
0
$ 
$ gcc-trunk -O1 small.c; a.out
1
$ icc -O1 small.c; a.out
1
$ 


------------------------------------


int printf (const char *, ...);

struct S
{
  int f0;
  int:1;
  int f1:8;
  int f2;
} a;

int b;

void
foo (int p)
{
  struct S c = a;
  c.f1 = p & 190;
  b = c.f1 < 1;
}

int
main ()
{
  foo (174);
  printf ("%d\n", b);
  return 0;
}
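
For context, here is a minimal hand check (an editorial addition, not part of the original report) of why the expected output is 1: assuming f1 behaves as a signed 8-bit bit-field (the GCC/Clang choice for plain 'int' bit-fields, discussed in comment 10 below), the stored value 174 & 190 has its top bit set and reads back as a negative number. The int8_t cast below stands in for the bit-field read; that narrowing conversion is implementation-defined in C but behaves as shown on common two's-complement targets.

#include <stdint.h>
#include <stdio.h>

int main (void)
{
  int stored = 174 & 190;        /* 0xAE = 174 */
  int8_t field = (int8_t)stored; /* mimics reading the signed 8-bit bit-field: -82 */
  printf ("%d\n", field < 1);    /* prints 1, the expected result */
  return 0;
}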
Comment 1 Kay Tiong Khoo 2013-11-14 17:17:48 PST
Simplified away extra struct fields, printf, global:

$ cat bitfield.c
struct S
{
  int f0:8;
  int f1:8;
} a;

int foo (int p)
{
  struct S c = a;
  c.f1 = p & 130;
  return c.f1 < 1;
}

int main ()
{
  return foo (128);
}

This still fails with -O1.
Comment 2 Kay Tiong Khoo 2013-11-14 17:20:03 PST
Unoptimized IR created by:
$ clang -O0 -fomit-frame-pointer bitfield.c -S -emit-llvm

; ModuleID = 'bitfield.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.0"

%struct.S = type { [2 x i8], [2 x i8] }

@a = common global %struct.S zeroinitializer, align 4

; Function Attrs: nounwind ssp uwtable
define i32 @foo(i32 %p) #0 {
entry:
  %p.addr = alloca i32, align 4
  %c = alloca %struct.S, align 4
  store i32 %p, i32* %p.addr, align 4
  %0 = bitcast %struct.S* %c to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.S* @a, i32 0, i32 0, i32 0), i64 4, i32 4, i1 false)
  %1 = load i32* %p.addr, align 4
  %and = and i32 %1, 130
  %2 = bitcast %struct.S* %c to i16*
  %3 = trunc i32 %and to i16
  %bf.load = load i16* %2, align 4
  %bf.value = and i16 %3, 255
  %bf.shl = shl i16 %bf.value, 8
  %bf.clear = and i16 %bf.load, 255
  %bf.set = or i16 %bf.clear, %bf.shl
  store i16 %bf.set, i16* %2, align 4
  %bf.result.shl = shl i16 %bf.value, 8
  %bf.result.ashr = ashr i16 %bf.result.shl, 8
  %bf.result.cast = sext i16 %bf.result.ashr to i32
  %4 = bitcast %struct.S* %c to i16*
  %bf.load1 = load i16* %4, align 4
  %bf.ashr = ashr i16 %bf.load1, 8
  %bf.cast = sext i16 %bf.ashr to i32
  %cmp = icmp slt i32 %bf.cast, 1
  %conv = zext i1 %cmp to i32
  ret i32 %conv
}

; Function Attrs: nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1

; Function Attrs: nounwind ssp uwtable
define i32 @main() #0 {
entry:
  %retval = alloca i32, align 4
  store i32 0, i32* %retval
  %call = call i32 @foo(i32 128)
  ret i32 %call
}

attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }

!llvm.ident = !{!0}

!0 = metadata !{metadata !"clang version 3.4 (trunk 194436)"}
Comment 3 Kay Tiong Khoo 2013-11-14 17:23:57 PST
Using opt with -O1 on the unoptimized IR:
$ ./opt -O1 bitfield.ll -S -o optbit.ll

Results in:

define i32 @foo(i32 %p) #0 {
entry:
  %bf.shl = and i32 %p, 130
  %cmp = icmp eq i32 %bf.shl, 0
  %conv = zext i1 %cmp to i32
  ret i32 %conv
}

Something has changed the '< 1' comparison into an equality comparison against '0'.
Comment 4 Kay Tiong Khoo 2013-11-14 17:25:22 PST
$ ./opt -O1 -debug-pass=Arguments -S < /dev/null
Pass Arguments:  -simplifycfg -domtree -domfrontier -mem2reg -instcombine
Pass Arguments:  -globalopt -ipsccp -deadargelim -instcombine -simplifycfg -basiccg -prune-eh -inline -functionattrs -domtree -domfrontier -scalarrepl -simplify-libcalls -instcombine -lazy-value-info -jump-threading -simplifycfg -instcombine -tailcallelim -simplifycfg -reassociate -domtree -loops -loopsimplify -lcssa -loop-rotate -licm -lcssa -loop-unswitch -instcombine -scalar-evolution -loopsimplify -lcssa -iv-users -indvars -loop-deletion -instcombine -memdep -memcpyopt -sccp -instcombine -lazy-value-info -jump-threading -correlated-propagation -domtree -memdep -dse -adce -simplifycfg -strip-dead-prototypes -print-used-types -deadtypeelim -preverify -domtree -verify -print-module

So it must be one of those passes...
Comment 5 Kay Tiong Khoo 2013-11-15 14:06:57 PST
Even more simplified test case source (doesn't need more than one bitfield, does appear to need the global and a mask constant that doesn't get optimized to something else):

$ cat bitfield.c
struct S
{
  int f0:3;
} a;

int foo (int p)
{
  struct S c = a;
  c.f0 = p & 6;
  return c.f0 < 1;
}

int main ()
{
  return foo (4);
}
Comment 6 Kay Tiong Khoo 2013-11-15 14:09:01 PST
More simplified source as unoptimized IR:

$ clang bitfield.c -S -emit-llvm

$ cat bitfield.ll
; ModuleID = 'bitfield.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.0"

%struct.S = type { i8, [3 x i8] }

@a = common global %struct.S zeroinitializer, align 4

; Function Attrs: nounwind ssp uwtable
define i32 @foo(i32 %p) #0 {
entry:
  %p.addr = alloca i32, align 4
  %c = alloca %struct.S, align 4
  store i32 %p, i32* %p.addr, align 4
  %0 = bitcast %struct.S* %c to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.S* @a, i32 0, i32 0), i64 4, i32 4, i1 false)
  %1 = load i32* %p.addr, align 4
  %and = and i32 %1, 6
  %2 = bitcast %struct.S* %c to i8*
  %3 = trunc i32 %and to i8
  %bf.load = load i8* %2, align 4
  %bf.value = and i8 %3, 7
  %bf.clear = and i8 %bf.load, -8
  %bf.set = or i8 %bf.clear, %bf.value
  store i8 %bf.set, i8* %2, align 4
  %bf.result.shl = shl i8 %bf.value, 5
  %bf.result.ashr = ashr i8 %bf.result.shl, 5
  %bf.result.cast = sext i8 %bf.result.ashr to i32
  %4 = bitcast %struct.S* %c to i8*
  %bf.load1 = load i8* %4, align 4
  %bf.shl = shl i8 %bf.load1, 5
  %bf.ashr = ashr i8 %bf.shl, 5
  %bf.cast = sext i8 %bf.ashr to i32
  %cmp = icmp slt i32 %bf.cast, 1
  %conv = zext i1 %cmp to i32
  ret i32 %conv
}

; Function Attrs: nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1

; Function Attrs: nounwind ssp uwtable
define i32 @main() #0 {
entry:
  %retval = alloca i32, align 4
  store i32 0, i32* %retval
  %call = call i32 @foo(i32 4)
  ret i32 %call
}

attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }

!llvm.ident = !{!0}

!0 = metadata !{metadata !"clang version 3.4 (trunk 194436)"}
Comment 7 Kay Tiong Khoo 2013-11-15 14:11:28 PST
And bug repros with just 2 optimization passes:

$ opt -sroa -instcombine  bitfield.ll -S -o bitfieldbug.ll
$ clang bitfieldbug.ll ; ./a.out ; echo $?
0

Optimized IR in this case is:

$ cat bitfieldbug.ll 
...
define i32 @foo(i32 %p) #0 {
entry:
  %bf.shl = and i32 %p, 6
  %cmp = icmp eq i32 %bf.shl, 0
  %conv = zext i1 %cmp to i32
  ret i32 %conv
}
...
Comment 8 Kay Tiong Khoo 2013-11-15 14:50:37 PST
-debug output shows:
INSTCOMBINE ITERATION #1 on foo
...
IC: Old =   %cmp = icmp slt i8 %bf.shl, 1
    New =   <badref> = icmp eq i8 %bf.shl, 0

So the comparison '< 1' is being mistakenly transformed into '== 0' here: a signed value is being treated as if it were unsigned.
Comment 9 Kay Tiong Khoo 2013-11-15 17:16:14 PST
Hmmm...given the IR, I think the instcombine optimizer is actually doing the right thing here. 

It recognizes that for an *8-bit* signed value (field f0), "p & 6" can never be negative, so for a comparison against 'less than 1' the only remaining possibility is equality with 0.

So now the question is:
Why is this struct with a 3-bit bitfield:
struct S
{
  int f0:3;
} a;

defined like this:
%struct.S = type { i8, [3 x i8] }

Unless I'm missing something, the IR has lost a fundamental piece of information from the original source: there's no way to know that the 3rd bit of f0 is the sign bit now.
Comment 10 Kay Tiong Khoo 2013-11-17 10:43:46 PST
I started a thread on llvm-dev to discuss this:
http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-November/067814.html

The original test case (and my simplified test cases) could be resolved as invalid because they do not specify signed/unsigned for the bitfield(s). In that case (and I think this is crazy), the C standard says the signedness is implementation-defined, and C++ has only recently fixed this:

http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#739

Thank you to Mark Lacey and Richard Smith for pointing this out.

However, there is still a bug in LLVM because its implementation behaves as we expected - a plain 'int' bitfield is treated like 'signed int'.

Henrique Santos noted that the unoptimized IR does preserve the sign bit with all of those shift left/right ops. So there is a bug in InstCombine.

I'm still not sure what advantage the front-end / clang gains by treating bit fields as 8-bit operands ('i8') rather than just using the actual number of bits, e.g. 'i3'. All that shifting can lead to other codegen problems, as seen in bug 17956.
Comment 11 Kay Tiong Khoo 2013-11-19 17:40:30 PST
This is the IR after sroa and heading into instcombine:

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.0"

%struct.S = type { i8, [3 x i8] }

@a = common global %struct.S zeroinitializer, align 4

define i32 @foo(i32 %p) #0 {
entry:
  %c.sroa.4 = alloca [3 x i8]
  %c.sroa.0.0.copyload = load i8* getelementptr inbounds (%struct.S* @a, i64 0, i32 0), align 4
  %c.sroa.4.0.idx = getelementptr inbounds [3 x i8]* %c.sroa.4, i64 0, i64 0
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %c.sroa.4.0.idx, i8* getelementptr inbounds (%struct.S* @a, i64 0, i32 1, i64 0), i64 3, i32 1, i1 false)
  %and = and i32 %p, 6
  %0 = trunc i32 %and to i8
  %bf.value = and i8 %0, 7
  %bf.clear = and i8 %c.sroa.0.0.copyload, -8
  %bf.set = or i8 %bf.clear, %bf.value
  %bf.result.shl = shl i8 %bf.value, 5
  %bf.result.ashr = ashr i8 %bf.result.shl, 5
  %bf.shl = shl i8 %bf.set, 5
  %bf.ashr = ashr i8 %bf.shl, 5
  %bf.cast = sext i8 %bf.ashr to i32
  %cmp = icmp slt i32 %bf.cast, 1
  %conv = zext i1 %cmp to i32
  ret i32 %conv
}

; Function Attrs: nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
Comment 12 Kay Tiong Khoo 2013-11-19 20:37:46 PST
Further simplified IR that still fails in instcombine (don't need any globals/structs):

define i1 @foo(i8 %p, i8 %q) #0 {
entry:
  %andp = and i8 %p, 6
  %andq = and i8 %q, 8
  %or = or i8 %andq, %andp
  %shl = shl i8 %or, 5
  %ashr = ashr i8 %shl, 5
  %cmp = icmp slt i8 %ashr, 1
  ret i1 %cmp
}

Note: the shifts appear to be impossible to generate with straight C code that just uses 'char' because of C's integer promotion rules.
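
A small illustration (an editorial addition, not from the thread) of the integer-promotion point: in plain C, a 'char' operand of a shift is promoted to 'int' first, so an expression like (c << 5) >> 5 is computed with 32-bit shifts and never produces the 8-bit shl/ashr pair seen in the reduced IR above.

#include <stdio.h>

int main (void)
{
  char c = 4;
  /* Both shift operands are promoted to int, so the whole expression
     has type int and is computed with 32-bit shifts. */
  printf ("%zu %zu\n", sizeof c, sizeof ((c << 5) >> 5)); /* typically prints "1 4" */
  printf ("%d\n", (c << 5) >> 5); /* prints 4, not -4: no 8-bit sign bit is involved */
  return 0;
}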
Comment 13 Kay Tiong Khoo 2013-11-20 15:53:59 PST
Finally, based on the debug output, here's the most simplified code before the incorrect optimization takes place:

define i1 @foo(i8 %p) #0 {
entry:
  %shlp = shl i8 %p, 5
  %andp = and i8 %shlp, -64
  %cmp = icmp slt i8 %andp, 32
  ret i1 %cmp
}
Comment 14 Kay Tiong Khoo 2013-11-20 16:06:21 PST
Henrique suggested that this particular optimization:
((x shl 5) and -64) slt 32 -> (x and -64) slt 1

or perhaps more easily readable by humans:
((x mul 32) and 0xC0) slt 32 -> (x and 0xC0) slt 1

should only be done if the shift instruction has an 'nsw' flag or if the comparison was unsigned (ult).

Adding those constraints in InstCombiner::visitICmpInstWithInstAndIntCst() via:

        bool CanFold = Shift->isLogicalShift() &&
                       (Shift->hasNoSignedWrap() || !ICI.isSigned());

Causes 6 existing test failures.

This one file dealing with comparison optimizations is over 3K lines of code that only a compiler person could love...there will be bugs.
Comment 15 Kay Tiong Khoo 2013-11-20 16:10:56 PST
(In reply to comment #14)
> Henrique suggested that this particular optimization:
> ((x shl 5) and -64) slt 32 -> (x and -64) slt 1
> 
> or perhaps more easily readable by humans:
> ((x mul 32) and 0xC0) slt 32 -> (x and 0xC0) slt 1

Forgot to shift the mask value:
((x mul 32) and 0xC0) slt 32 -> (x and 0x06) slt 1
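
To make the miscompile concrete, here is a minimal hand check (an editorial addition) of the transform as stated above, using x = 4 to match foo(4) from the reduced test case. The int8_t casts stand in for the 8-bit IR values; the narrowing conversions are implementation-defined in C but behave as shown on common two's-complement targets.

#include <stdint.h>
#include <stdio.h>

int main (void)
{
  int8_t x = 4;
  int8_t shifted = (int8_t)(x << 5);         /* 0x80 = -128 */
  int before = (int8_t)(shifted & -64) < 32; /* original form: -128 s< 32 -> 1 */
  int after  = (int8_t)(x & 0x06) < 1;       /* folded form:      4 s< 1 -> 0 */
  printf ("%d %d\n", before, after);         /* prints "1 0": the fold flips the result */
  return 0;
}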
Comment 16 Kay Tiong Khoo 2013-11-20 17:04:56 PST
For reference, here's what broke with that one line change:

$ llvm-lit ../llvm/test/Transforms/InstCombine/
...
Failing Tests (6):
    LLVM :: Transforms/InstCombine/2006-09-15-CastToBool.ll
    LLVM :: Transforms/InstCombine/2007-03-25-BadShiftMask.ll
    LLVM :: Transforms/InstCombine/apint-shift.ll
    LLVM :: Transforms/InstCombine/icmp.ll
    LLVM :: Transforms/InstCombine/set.ll
    LLVM :: Transforms/InstCombine/shift.ll
Comment 17 Kay Tiong Khoo 2013-11-20 17:18:15 PST
    LLVM :: Transforms/InstCombine/2006-09-15-CastToBool.ll

This is based on bug 913.

    LLVM :: Transforms/InstCombine/2007-03-25-BadShiftMask.ll

This is based on bug 1271.
Comment 18 Kay Tiong Khoo 2013-11-20 20:03:59 PST
Phew...all of the failures were due to the fact that (strangely) you can't ask a logical right shift if it hasNoSignedWrap()...you'd think a BinaryOperator that isn't an OverflowingBinaryOperator would just return false for that, but no, it asserts on the cast.

By checking for left shift before asking about the signed wrap, all tests are passing again. 

It is frightening how little logic some of these tests actually confirm and how ad-hoc the testing is...
Comment 19 Kay Tiong Khoo 2013-11-21 13:44:51 PST
Patch submitted for review.
Comment 20 Kay Tiong Khoo 2013-11-25 12:14:16 PST
Evan Cheng looked at the patch and this bug report (thanks!) and suggested:
"Perhaps you can use Nuno's technique to verify the optimization with a SMT solver?"

This is the first I've heard of this work - I found:
http://web.ist.utl.pt/nuno.lopes/pubs/cork-spin13.pdf

And I see there was a presentation at an LLVM Dev Mtg:
http://llvm.org/devmtg/2013-11/#talk5

I haven't located any slides from that talk though.

Is there any testing machinery already in LLVM that I can use for this?

cc'ing Evan (I missed your reply on the commits list until now - too much traffic!) and Nuno.
Comment 21 Nuno Lopes 2013-11-25 13:50:30 PST
(In reply to comment #20)
> Evan Cheng looked at the patch and this bug report (thanks!) and suggested:
> "Perhaps you can use Nuno's technique to verify the optimization with a SMT
> solver?"

Please ignore that paper on CORK for now, since it's theoretical work.
The slides from my talk can be found here: http://web.ist.utl.pt/~nuno.lopes/pres/llvm-devconf13.pptx

Currently there's no machinery, so the translation to SMT has to be done by hand. I did a simplified version of the optimization for Shl and LShr, which are the cases you are trying to change.

I couldn't find any bug in the mentioned instcombine transformation. You can find the proof here:
http://rise4fun.com/Z3/IGjx

Unless I've missed some detail of the code, Z3 says it is correct and no patch is needed!
Comment 22 Kay Tiong Khoo 2013-11-26 11:09:52 PST
Thanks, Nuno! That is very good to find the Z3 site. I hope we can trust it. :)

I think you must specify 'less than' as the bit vector comparator rather than 'equals'. 

This is my first try at using this language, but here's my translation of the shift-left case:
http://rise4fun.com/Z3/wqUA

Just in case that link disappears:
(declare-fun X () (_ BitVec 3))
(declare-fun Shift () (_ BitVec 3))
(declare-fun Mask () (_ BitVec 3))
(declare-fun Y () (_ BitVec 3))

; ShiftOpcode == Instruction::Shl
(assert (not (=
  (ite (= (bvshl (bvlshr Y Shift) Shift) Y)
    ; ((X << Shift) & Mask) < Y
    (bvslt (bvand (bvshl X Shift) Mask) Y)
  true)
    
  (ite (= (bvshl (bvlshr Y Shift) Shift) Y)
    ; (X & (Mask >> Shift)) < (Y >> Shift)
    (bvslt (bvand X (bvlshr Mask Shift)) (bvlshr Y Shift))
  true)
)))
(check-sat)
(get-model)
Comment 23 Nuno Lopes 2013-11-26 12:44:06 PST
Thanks for your review!
Now I see that you were referring to the signed comparisons.

I did some tests for signed < ("(X << C1) & C2 s< C3") and the results are:
 1) If C2 and C3 don't have the sign bit set, then the transformation is correct
 2) Assuming that left shift overflow is undefined isn't sufficient to prove correctness
 3) Doing the transformation when the shift is NSW is *not* correct. Counterexample (4 bits):

X = #x0
C3 = #x8
C2 = #x0
C1 = #x3

So it seems that the correct fix is 1) and not 3).

Feel free to play with the tests: http://rise4fun.com/Z3/plN5x
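
A hand check of that counterexample (an editorial addition), assuming the folded form from comment 22, i.e. comparing (X & (C2 u>> C1)) against (C3 u>> C1) in 4-bit arithmetic. Note that X << C1 is 0 here, so the shift does not wrap and the nsw condition is satisfied, yet the fold still changes the result:

#include <stdio.h>

/* Interpret the low 4 bits of v as a signed 4-bit value. */
static int sext4 (int v) { v &= 0xF; return (v & 0x8) ? v - 16 : v; }

int main (void)
{
  int X = 0x0, C1 = 0x3, C2 = 0x0, C3 = 0x8;
  int before = sext4 ((X << C1) & C2) < sext4 (C3);       /* 0 s< -8 -> 0 */
  int after  = sext4 (X & (C2 >> C1)) < sext4 (C3 >> C1); /* 0 s<  1 -> 1 */
  printf ("%d %d\n", before, after);                      /* prints "0 1" */
  return 0;
}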
Comment 24 Kay Tiong Khoo 2013-11-26 13:36:55 PST
Here's my implementation of the 3 left-shift cases mentioned so far:
1. Plain left-shift with signed comparison (this is the original C test case condition, so we know this is illegal)

2. nsw left-shift with signed comparison

3. left-shift with unsigned comparison

Note that I've reduced the data types to 2-bit vectors because I'm not sure how to code 'shl nsw' in LISP otherwise. :)

http://rise4fun.com/Z3/1Uwh
Comment 25 Kay Tiong Khoo 2013-11-26 13:38:47 PST
If I've coded it correctly, Z3 proves that case #2 (nsw) is not a safe optimization. This is the same conclusion that Nuno just reached.
Comment 26 Nuno Lopes 2013-11-26 16:05:12 PST
(In reply to comment #25)
> If I've coded it correctly, Z3 proves that case #2 (nsw) is not a safe
> optimization. This is the same conclusion that Nuno just reached.

Cool!
I'm pretty happy that you managed to work with Z3 :)

In summary, the conclusion is that neither the current code nor your patch is correct.
My proposal is that we should only do the transformation when C2 and C3 don't have the sign bit set (if doing a signed comparison).

Feel free to propose a new patch. I'll be glad to review it.
Comment 27 Kay Tiong Khoo 2013-11-27 19:31:20 PST
Thanks, Nuno. I'm having fun with Z3. :)

I decided to code up the full set (30!) of comparisons and shifts (because that's what the code claims to handle) with the only constraint that no bits are lost when shifting the comparison value:

http://rise4fun.com/Z3/PHzD

From that result, I think we see that if either the shift is signed (arithmetic shift right) or the comparison is signed, then the transformation is SAT (i.e., illegal).

I didn't add the sign bit constraint for C2 and C3 that you found yet. How did you determine that constraint? I guess the right thing to do is check various permutations of that kind of constraint until all tests pass?
Comment 28 Nuno Lopes 2013-11-28 02:39:06 PST
(In reply to comment #27)
> Thanks, Nuno. I'm having fun with Z3. :)
> 
> I decided to code up the full set (30!) of comparisons and shifts (because
> that's what the code claims to handle) with the only constraint that no bits
> are lost when shifting the comparison value:
> 
> http://rise4fun.com/Z3/PHzD

Cool! :)


> From that result, I think we see that if either the shift is signed
> (arithmetic shift right) or the comparison is signed, then the
> transformation is SAT (ie, illegal).
> 
> I didn't add the sign bit constraint for C2 and C3 that you found yet. How
> did you determine that constraint? I guess the right thing to do is check
> various permutations of that kind of constraint until all tests pass?

It was a guess. From past experience, it seemed like a good candidate. Then I used Z3 to prove that it is actually correct.
Of course, it doesn't mean it's the weakest precondition for the transformation.

My suggestion is that we first fix the current code, so that the fix goes into LLVM 3.4.  Then we can generalize the patch if needed/possible.

BTW, we do have an algorithm to generate these conditions automatically. But we have only implemented it for a simple IR, not yet for LLVM's IR.
Comment 29 Kay Tiong Khoo 2013-11-28 13:01:46 PST
Hi Nuno,
I agree that we should correct the bug first, and then relax the constraints as much as possible to allow more optimization. 

To that end, here is the simplest patch that I can think of to correct the codegen for 3.4 - it just checks the comparison to make sure it is not signed in the case of the logical shifts. I think the existing constraint on the arithmetic shift right case is safe, but I have not proven it yet:

Index: lib/Transforms/InstCombine/InstCombineCompares.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCompares.cpp  (revision 195371)
+++ lib/Transforms/InstCombine/InstCombineCompares.cpp  (working copy)
@@ -1198,11 +1198,16 @@
       Type *AndTy = AndCST->getType();          // Type of the and.

       // We can fold this as long as we can't shift unknown bits
-      // into the mask.  This can only happen with signed shift
-      // rights, as they sign-extend.
+      // into the mask. This can happen with signed shift
+      // rights, as they sign-extend. With logical shifts,
+      // we must still make sure the comparison is not signed
+      // because we are effectively changing the
+      // position of the sign bit (PR17827).
+      // TODO: We can relax these constraints a bit more.
       if (ShAmt) {
-        bool CanFold = Shift->isLogicalShift();
-        if (!CanFold) {
+        bool CanFold = false;
+        unsigned ShiftOpcode = Shift->getOpcode();
+        if (ShiftOpcode == Instruction::AShr) {
           // To test for the bad case of the signed shr, see if any
           // of the bits shifted in could be tested after the mask.
           uint32_t TyBits = Ty->getPrimitiveSizeInBits();
@@ -1212,6 +1217,9 @@
           if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) &
                AndCST->getValue()) == 0)
             CanFold = true;
+        } else if (ShiftOpcode == Instruction::Shl ||
+                   ShiftOpcode == Instruction::LShr) {
+          CanFold = !ICI.isSigned();
         }

         if (CanFold) {
Comment 30 Kay Tiong Khoo 2013-11-28 13:06:02 PST
Created attachment 11626 [details]
simplest patch to correct bug for 3.4 release

This patch adds the constraint that we should not alter the comparison for signed comparisons. It fixes the test cases in this bug report. 

Some test cases included, but they are not thorough.

If anyone would like to review and commit this patch, I would be most appreciative. I will not be able to commit it myself for at least the next 3 days.
Comment 31 Nuno Lopes 2013-11-28 16:09:54 PST
(In reply to comment #30)
> Created attachment 11626 [details]
> simplest patch to correct bug for 3.4 release
> 
> This patch adds the constraint that we should not alter the comparison for
> signed comparisons. It fixes the test cases in this bug report. 
> 
> Some test cases included, but they are not thorough.
> 
> If anyone would like to review and commit this patch, I would be most
> appreciative. I will not be able to commit it myself for at least the next 3
> days.

The patch LGTM. Please commit it when you are available again. Thanks!
Comment 32 Kay Tiong Khoo 2013-12-02 12:53:21 PST
Thanks, Nuno. Committed as r196129.
Comment 33 Kay Tiong Khoo 2013-12-02 15:07:21 PST
Just checking the sign bits of C2 and C3 does not appear to be sufficient for a right-shift (either arithmetic or logical):

25. u>> slt sat 
(model 
(define-fun X ()  (_ BitVec 8) #x80) 
(define-fun C3 () (_ BitVec 8) #x00) 
(define-fun C2 () (_ BitVec 8) #x60) 
(define-fun C1 () (_ BitVec 8) #x02) )

http://rise4fun.com/Z3/9tj4
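
A hand check of that model (an editorial addition), assuming the logical-right-shift fold compares (X & (C2 << C1)) against (C3 << C1), as in the Z3 encodings quoted at the end of this report. C2 and C3 are both non-negative here, yet the original and folded compares disagree:

#include <stdint.h>
#include <stdio.h>

int main (void)
{
  uint8_t X = 0x80, C1 = 0x02, C2 = 0x60, C3 = 0x00;
  int before = (int8_t)((X >> C1) & C2) < (int8_t)C3;  /* 0x20 s< 0 -> 0 */
  int after  = (int8_t)(X & (uint8_t)(C2 << C1))
               < (int8_t)(uint8_t)(C3 << C1);          /* 0x80 (-128) s< 0 -> 1 */
  printf ("%d %d\n", before, after);                   /* prints "0 1" */
  return 0;
}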
Comment 34 Nuno Lopes 2013-12-03 16:08:43 PST
(In reply to comment #33)
> Just checking the sign bits of C2 and C3 does not appear to be sufficient
> for a right-shift (either arithmetic or logical):
> 
> 25. u>> slt sat 
> (model 
> (define-fun X ()  (_ BitVec 8) #x80) 
> (define-fun C3 () (_ BitVec 8) #x00) 
> (define-fun C2 () (_ BitVec 8) #x60) 
> (define-fun C1 () (_ BitVec 8) #x02) )
> 
> http://rise4fun.com/Z3/9tj4

You're right.

2nd proposal, perform transformation only if:
 (or
   (bvslt C3 #x00)
   (and
     (bvsge (bvshl C3 C1) #x00)
     (bvsge (bvshl C2 C1) #x00)
   )
 )

i.e., we don't shift a 1 into the sign bit, unless C3 was already negative.
Comment 35 Kay Tiong Khoo 2013-12-03 16:25:06 PST
The existing check to allow optimization on an arithmetic shift right is insufficient. It just checks if any bits that are shifted in overlap with the mask.

Here's a test case that proves it:

; Function Attrs: nounwind readnone ssp uwtable
define i1 @foo(i8 %p) #0 {
entry:
  %shlp = ashr i8 %p, 1
  %andp = and i8 %shlp, 64
  %cmp = icmp slt i8 %andp, -63
  ret i1 %cmp
}

; Function Attrs: nounwind readnone ssp uwtable
define i1 @main() #0 {
entry:
  %call = tail call i1 @foo(i8 128)
  ret i1 %call
}

...

Or in Z3 form - http://rise4fun.com/Z3/5FiO :
15. s>> slt 
sat (model 
(define-fun X () (_ BitVec 8) #x80) 
(define-fun C3 () (_ BitVec 8) #xc1) 
(define-fun C2 () (_ BitVec 8) #x40) 
(define-fun C1 () (_ BitVec 8) #x01)
)

...

However, it appears that there is no way to expose this bug in actual codegen because earlier optimizations in InstCombine such as SimplifyICmpInst() always optimize the IR before we get to the shift+and+cmp chunk of code.
Comment 36 Nuno Lopes 2013-12-04 16:26:47 PST
Ok, so I give up trying to weaken it by hand.

The only safe thing I can come up with is:
(and (bvsge (bvshl C3 C1) #x00)
     (bvsge (bvshl C2 C1) #x00)
)
Comment 37 Kay Tiong Khoo 2013-12-04 16:47:40 PST
(In reply to comment #36)
> Ok, so I give up trying to weak it by hand.
> 
> The only safe thing I can come up with is:
> (and (bvsge (bvshl C3 C1) #x00)
>      (bvsge (bvshl C2 C1) #x00)
> )

Heh...yeah, I was experimenting by hand too, and I've been wondering if this is worth so much effort...we're just getting rid of 1 shift in some obscure cases now!

I'll try to code this up in C and propose a patch.
Comment 38 Kay Tiong Khoo 2013-12-05 16:44:15 PST
Created attachment 11675 [details]
relax previous patch to allow more optimization and fix AShr

Here's a patch that (I hope) implements the constraints specified by Nuno for the left shift and logical right shift cases. 

I have not found any reasonable constraint to make the arithmetic right shift case work, so I've just changed that to not fold. Based on my experiments, it is very likely that a previous optimization has simplified AShr into LShr or removed it entirely, so this should not be a significant loss.

Index: lib/Transforms/InstCombine/InstCombineCompares.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCompares.cpp	(revision 196164)
+++ lib/Transforms/InstCombine/InstCombineCompares.cpp	(working copy)
@@ -1195,33 +1195,42 @@
       ConstantInt *ShAmt;
       ShAmt = Shift ? dyn_cast<ConstantInt>(Shift->getOperand(1)) : 0;
 
-      // We can fold this as long as we can't shift unknown bits
-      // into the mask. This can happen with signed shift
-      // rights, as they sign-extend. With logical shifts,
-      // we must still make sure the comparison is not signed
-      // because we are effectively changing the
-      // position of the sign bit (PR17827).
-      // TODO: We can relax these constraints a bit more.
+      // This seemingly simple opportunity to fold away a shift turns out to
+      // be rather complicated. See PR17827 for details.
       if (ShAmt) {
         bool CanFold = false;
         unsigned ShiftOpcode = Shift->getOpcode();
         if (ShiftOpcode == Instruction::AShr) {
-          // To test for the bad case of the signed shr, see if any
-          // of the bits shifted in could be tested after the mask.
-          Type *ShiftType = Shift->getType();
-          Type *AndType = AndCst->getType();
- 
-          unsigned ShiftBitWidth = ShiftType->getPrimitiveSizeInBits();
-          unsigned AndBitWidth = AndType->getPrimitiveSizeInBits();
-
-          int ShAmtVal = ShiftBitWidth - ShAmt->getLimitedValue(ShiftBitWidth);
-
-          if ((APInt::getHighBitsSet(AndBitWidth, AndBitWidth - ShAmtVal) &
-               AndCst->getValue()) == 0)
+          // There may be some constraints that make this possible, but it is
+          // not obvious what those constraints are.
+          CanFold = false;
+        } else if (ShiftOpcode == Instruction::Shl) {
+          // For a left shift, we can fold if the comparison is not signed.
+          // We can also fold a signed comparison if the mask value and
+          // comparison value are not negative. These constraints are not
+          // obvious, but we can prove that they are correct using an SMT
+          // solver such as "Z3" :
+          // http://rise4fun.com/Z3/DyMp
+          if (!ICI.isSigned() || (!AndCst->isNegative() && !RHS->isNegative()))
             CanFold = true;
-        } else if (ShiftOpcode == Instruction::Shl ||
-                   ShiftOpcode == Instruction::LShr) {
-          CanFold = !ICI.isSigned();
+        } else if (ShiftOpcode == Instruction::LShr) {
+          // For a logical right shift, we can fold if the comparison is not
+          // signed. We can also fold a signed comparison if the shifted mask
+          // value and the shifted comparison value are not negative.
+          // These constraints are not obvious, but we can prove that they are
+          // correct using an SMT solver such as "Z3" :
+          // http://rise4fun.com/Z3/Tslfh
+          if (!ICI.isSigned())
+            CanFold = true;
+          else {
+            ConstantInt *ShiftedAndCst =
+              cast<ConstantInt>(ConstantExpr::getShl(AndCst, ShAmt));
+            ConstantInt *ShiftedRHSCst =
+              cast<ConstantInt>(ConstantExpr::getShl(RHS, ShAmt));
+            
+            if (!ShiftedAndCst->isNegative() && !ShiftedRHSCst->isNegative())
+              CanFold = true;
+          }
         }
 
         if (CanFold) {
Comment 39 Nuno Lopes 2013-12-15 16:11:07 PST
Sorry for the delay.
The patch LGTM; please commit.

I assume you encoded the current Ashr implementation and you found a bug, right?
Comment 40 Kay Tiong Khoo 2013-12-16 11:19:44 PST
(In reply to comment #39)
> Sorry for the delay.
> The patch LGTM; please commit.
> 
> I assume you encoded the current Ashr implementation and you found a bug,
> right?

Thanks, Nuno. Yes, I think the Ashr case is incorrect. Please see comment 35 for an example in IR and Z3.
Comment 41 Nuno Lopes 2013-12-17 16:24:53 PST
(In reply to comment #40)
> (In reply to comment #39)
> > Sorry for the delay.
> > The patch LGTM; please commit.
> > 
> > I assume you encoded the current Ashr implementation and you found a bug,
> > right?
> 
> Thanks, Nuno. Yes, I think the Ashr case is incorrect. Please see comment 35
> for example in IR and Z3.

Ok, please commit the patch and close this bug report. Thanks!
Comment 42 Kay Tiong Khoo 2013-12-19 12:25:48 PST
Patch committed in r197705.

Author: kkhoo
Date: Thu Dec 19 12:07:17 2013
New Revision: 197705

URL: http://llvm.org/viewvc/llvm-project?rev=197705&view=rev
Log:
Improved fix for PR17827 (instcombine of shift/and/compare).

This change fixes the case of arithmetic shift right - do not attempt to fold that case.
This change also relaxes the conditions when attempting to fold the logical shift right and shift left cases.

No additional IR-level test cases included at this time. See http://llvm.org/bugs/show_bug.cgi?id=17827 for proofs that these are correct transformations.
Comment 43 Kay Tiong Khoo 2013-12-19 12:39:01 PST
Hal correctly points out that, due to Z3's terms of use, some LLVM customers may be on questionable legal ground if the LLVM codebase includes links to Z3, and also that those links may not last, so I have removed the links in r197713. 

Here are my code examples for the left shift and logical right shift cases that were referenced in those comments:

; We want to optimize an expression of the form:
; (X >> C1) & C2 != C3 (where any shift and any compare could exist)

; We need to check 10 forms of compares:
; signed less-than, signed less-than-or-equal, signed greater-than, signed greater-than-or-equal
; unsigned less-than, unsigned less-than-or-equal, unsigned greater-than, unsigned greater-than-or-equal
; equal, not-equal

(declare-fun X () (_ BitVec 8))
(declare-fun C1 () (_ BitVec 8)) ; shift value
(declare-fun C2 () (_ BitVec 8)) ; mask value
(declare-fun C3 () (_ BitVec 8)) ; comparison value

(push) ; LEFT-SHIFT
(assert (and
  (bvsge C2 #x00) 
  (bvsge C3 #x00) 
  (= (bvshl (bvlshr C3 C1) C1) C3) ; must not lose any bits when shifting comparison value
 ; (bvult C1 8) ; restrict to defined behavior - shift value can not be greater than vector size
))
(push)
(echo "1. << ult")
(assert (not (=
  ; (X << C1) & C2 s< C3
  (bvult (bvand (bvshl X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvult (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "2. << ule")
(assert (not (=
  ; (X << C1) & C2 s< C3
  (bvule (bvand (bvshl X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvule (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "3. << ugt")
(assert (not (=
  ; (X << C1) & C2 s< C3
  (bvugt (bvand (bvshl X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvugt (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "4. << uge")
(assert (not (=
  ; (X << C1) & C2 s< C3
  (bvuge (bvand (bvshl X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvuge (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "5. << slt")
(assert (not (=
  ; (X << C1) & C2 s< C3
  (bvslt (bvand (bvshl X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvslt (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "6. << sle")
(assert (not (=
  ; (X << C1) & C2 s< C3
  (bvsle (bvand (bvshl X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvsle (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "7. << sgt")
(assert (not (=
  ; (X << C1) & C2 s< C3
  (bvsgt (bvand (bvshl X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvsgt (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "8. << sge")
(assert (not (=
  ; (X << C1) & C2 s< C3
  (bvsge (bvand (bvshl X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvsge (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)


(push)
(echo "9. << eq")
(assert (not (=
  ; (X << C1) & C2 s< C3
  (= (bvand (bvshl X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (= (bvand X (bvlshr C2 C1)) (bvlshr C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "10. << not eq")
(assert (not (=
  ; (X << C1) & C2 s< C3
  (not (= (bvand (bvshl X C1) C2) C3))
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (not (= (bvand X (bvlshr C2 C1)) (bvlshr C3 C1)))
)))
(check-sat)
;(get-model)
(pop)

(pop) ; left-shift

; We want to optimize an expression of the form:
; (X >> C1) & C2 != C3 (where any shift and any compare could exist)

; We also need to check 10 forms of compares:
; signed less-than, signed less-than-or-equal, signed greater-than, signed greater-than-or-equal
; unsigned less-than, unsigned less-than-or-equal, unsigned greater-than, unsigned greater-than-or-equal
; equal, not-equal


(declare-fun X () (_ BitVec 8))
(declare-fun C1 () (_ BitVec 8)) ; shift value
(declare-fun C2 () (_ BitVec 8)) ; mask value
(declare-fun C3 () (_ BitVec 8)) ; comparison value


(push) ; LOGICAL RIGHT SHIFT
(assert (and
  (and (bvsge (bvshl C3 C1) #x00)
     (bvsge (bvshl C2 C1) #x00)
  )
 ;(bvsge C2 #x00) 
 ;(bvsge C3 #x00) 
 (= (bvlshr (bvshl C3 C1) C1) C3) ; must not lose any bits when shifting comparison value
 ; (bvult C1 4) ; restrict to defined behavior - shift value can not be greater than vector size
))
(push)

(push)
(echo "21. u>> ult")
(assert (not (=
  ; (X u>> C1) & C2 s< C3
  (bvult (bvand (bvlshr X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvult (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "22. u>> ule")
(assert (not (=
  ; (X u>> C1) & C2 s< C3
  (bvule (bvand (bvlshr X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvule (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))

(check-sat)
;(get-model)
(pop)

(push)
(echo "23. u>> ugt")
(assert (not (=
  ; (X u>> C1) & C2 s< C3
  (bvugt (bvand (bvlshr X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvugt (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "24. u>> uge")
(assert (not (=
  ; (X u>> C1) & C2 s< C3
  (bvuge (bvand (bvlshr X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvuge (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "25. u>> slt")
(assert (not (=
  ; (X u>> C1) & C2 s< C3
  (bvslt (bvand (bvlshr X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvslt (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "26. u>> sle")
(assert (not (=
  ; (X u>> C1) & C2 s< C3
  (bvsle (bvand (bvlshr X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvsle (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))

(check-sat)
;(get-model)
(pop)

(push)
(echo "27. u>> sgt")
(assert (not (=
  ; (X u>> C1) & C2 s< C3
  (bvsgt (bvand (bvlshr X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvsgt (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "28. u>> sge")
(assert (not (=
  ; (X u>> C1) & C2 s< C3
  (bvsge (bvand (bvlshr X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (bvsge (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))
(check-sat)
;(get-model)
(pop)

(push)
(echo "29. u>> eq")
(assert (not (=
  ; (X u>> C1) & C2 s< C3
  (= (bvand (bvlshr X C1) C2) C3)
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (= (bvand X (bvshl C2 C1)) (bvshl C3 C1))
)))
(check-sat)
(pop)

(push)
(echo "30. u>> not eq")
(assert (not (=
  ; (X u>> C1) & C2 s< C3
  (not (= (bvand (bvlshr X C1) C2) C3))
  
  ; (X & (C2 >> C1)) s< (C3 >> C1)
  (not (= (bvand X (bvshl C2 C1)) (bvshl C3 C1)))
)))
(check-sat)
(pop)

(pop) ; right-shift