20059 – wrong code (FP exception) at -O3 on x86_64-linux-gnu (bad instcombine)

LLVM Bugzilla is read-only and represents the historical archive of all LLVM issues filled before November 26, 2021. Use github to submit LLVM bugs

Bug 20059 - wrong code (FP exception) at -O3 on x86_64-linux-gnu (bad instcombine)

Summary: wrong code (FP exception) at -O3 on x86_64-linux-gnu (bad instcombine)

Status:	RESOLVED FIXED

Alias:	None

Product:	clang
Classification:	Unclassified
Component:	-New Bugs (show other bugs)
Version:	trunk
Hardware:	PC All

Importance:	P normal
Assignee:	Unassigned Clang Bugs

URL:
Keywords:

Depends on:
Blocks:

Reported:	2014-06-16 20:58 PDT by Zhendong Su
Modified:	2018-03-01 08:55 PST (History)
CC List:	6 users (show)

See Also:	20358
Fixed By Commit(s):

Attachments
t3.ll testcase (3.80 KB, application/octet-stream) 2014-06-22 12:01 PDT, Arnold Schwaighofer	Details
Add an attachment (proposed patch, testcase, etc.)

Note You need to log in before you can comment on or make changes to this bug.

Description Zhendong Su 2014-06-16 20:58:06 PDT

The current clang trunk miscompiles the following code at -O3 on x86_64-linux-gnu (in both 32-bit and 64-bit modes). 

This is a regression from 3.4. 

$ clang-trunk -v
clang version 3.5.0 (trunk 211006)
Target: x86_64-unknown-linux-gnu
Thread model: posix
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.6
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.6.4
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.7
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.7.3
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8.1
Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8
Candidate multilib: .;@m64
Candidate multilib: 32;@m32
Selected multilib: .;@m64
$ 
$ clang-trunk -O2 small.c; a.out
$ clang-3.4 -O3 small.c; a.out
$ 
$ clang-trunk -O3 small.c
$ a.out
Floating point exception (core dumped)
$ 


---------------------


int a, b, c, d = 1;
char e = 1;

void
fn1 ()
{
  int h;
  for (; e; e++)
    {
      for (; b;)
	d--;
      h = (d ? a % d : 0);
      c &= h;
    }
}

int
main ()
{
  fn1 ();
  return 0;
}

Comment 1 Sanjay Patel 2014-06-17 13:15:43 PDT

IR test case produced via:
$ ./clang 20059.c -O3 -mllvm -disable-llvm-optzns -S -emit-llvm
$ cat 20059.ll
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.9.0"

@d = global i32 1, align 4
@e = global i8 1, align 1
@b = common global i32 0, align 4
@a = common global i32 0, align 4
@c = common global i32 0, align 4

; Function Attrs: nounwind ssp uwtable
define void @fn1() #0 {
entry:
  %h = alloca i32, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  %0 = load i8* @e, align 1, !tbaa !1
  %tobool = icmp ne i8 %0, 0
  br i1 %tobool, label %for.body, label %for.end5

for.body:                                         ; preds = %for.cond
  br label %for.cond1

for.cond1:                                        ; preds = %for.body3, %for.body
  %1 = load i32* @b, align 4, !tbaa !4
  %tobool2 = icmp ne i32 %1, 0
  br i1 %tobool2, label %for.body3, label %for.end

for.body3:                                        ; preds = %for.cond1
  %2 = load i32* @d, align 4, !tbaa !4
  %dec = add nsw i32 %2, -1
  store i32 %dec, i32* @d, align 4, !tbaa !4
  br label %for.cond1

for.end:                                          ; preds = %for.cond1
  %3 = load i32* @d, align 4, !tbaa !4
  %tobool4 = icmp ne i32 %3, 0
  br i1 %tobool4, label %cond.true, label %cond.false

cond.true:                                        ; preds = %for.end
  %4 = load i32* @a, align 4, !tbaa !4
  %5 = load i32* @d, align 4, !tbaa !4
  %rem = srem i32 %4, %5
  br label %cond.end

cond.false:                                       ; preds = %for.end
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %rem, %cond.true ], [ 0, %cond.false ]
  store i32 %cond, i32* %h, align 4, !tbaa !4
  %6 = load i32* %h, align 4, !tbaa !4
  %7 = load i32* @c, align 4, !tbaa !4
  %and = and i32 %7, %6
  store i32 %and, i32* @c, align 4, !tbaa !4
  br label %for.inc

for.inc:                                          ; preds = %cond.end
  %8 = load i8* @e, align 1, !tbaa !1
  %inc = add i8 %8, 1
  store i8 %inc, i8* @e, align 1, !tbaa !1
  br label %for.cond

for.end5:                                         ; preds = %for.cond
  ret void
}

; Function Attrs: nounwind ssp uwtable
define i32 @main() #0 {
entry:
  %retval = alloca i32, align 4
  store i32 0, i32* %retval
  call void @fn1()
  ret i32 0
}

attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.ident = !{!0}

!0 = metadata !{metadata !"clang version 3.5.0 (210980)"}
!1 = metadata !{metadata !2, metadata !2, i64 0}
!2 = metadata !{metadata !"omnipotent char", metadata !3, i64 0}
!3 = metadata !{metadata !"Simple C/C++ TBAA"}
!4 = metadata !{metadata !5, metadata !5, i64 0}
!5 = metadata !{metadata !"int", metadata !2, i64 0}

Comment 2 Sanjay Patel 2014-06-17 13:22:14 PDT

Looks like a bug in the loop vectorizer:

$ ./opt -tbaa -basicaa -inline -early-cse -instcombine -simplifycfg -loop-rotate -licm -loop-unswitch -gvn -sccp -instcombine -correlated-propagation -dse  20059.ll -S -o pre_vector.ll

$ ./opt -loop-vectorize pre_vector.ll -S
Instruction does not dominate all uses!
  %6 = zext i8 %0 to i32
  %bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ %6, %for.cond1.preheader.lr.ph.split.us.i.split.us ]
LLVM ERROR: Broken function found, compilation aborted!

Comment 3 Arnold Schwaighofer 2014-06-17 13:35:47 PDT

The original code looks like it is a duplicate PR19823 (a bug in the runtime unroller similar to what we had in the vectorizer). What happens if this is compiled with -fno-unroll-loops?

Comment 4 Sanjay Patel 2014-06-17 14:15:04 PDT

Hi Arnold - thanks for your help. Was comment 3 intended for bug 20063?

I don't see any difference in this test case with -fno-unroll-loops:

$ ./clang -O3 20059.c && ./a.out
Floating point exception: 8
$ ./clang -O3 -fno-unroll-loops 20059.c && ./a.out
Floating point exception: 8

Comment 5 Arnold Schwaighofer 2014-06-22 12:00:52 PDT

(In reply to comment #2)
> Looks like a bug in the loop vectorizer:
> 
> $ ./opt -tbaa -basicaa -inline -early-cse -instcombine -simplifycfg
> -loop-rotate -licm -loop-unswitch -gvn -sccp -instcombine
> -correlated-propagation -dse  20059.ll -S -o pre_vector.ll
> 
> $ ./opt -loop-vectorize pre_vector.ll -S
> Instruction does not dominate all uses!
>   %6 = zext i8 %0 to i32
>   %bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ %6,
> %for.cond1.preheader.lr.ph.split.us.i.split.us ]
> LLVM ERROR: Broken function found, compilation aborted!

Yes, this is a separate error that I introduced :( when I fixed the back edge overflow error. I have fixed this in ToT (r211460).

But, there is a separate error causing the bug reported here.

It is not an error in the loop vectorizer. If it dump the IR right after the vectorizer has vectorized the loop and llc that test case it works fine. However, instcombine creates an srem on a partially undef vector.

% rm -f a.out && bin/opt -basicaa -loop-vectorize -S < t3.ll | bin/llc - -o - > tmp.s && bin/clang tmp.s && ./a.out
vs.
% rm -f a.out && bin/opt -basicaa -loop-vectorize -instcombine -S < t3.ll | bin/llc - -o - > tmp.s && bin/clang tmp.s && ./a.out
Floating point exception: 8


The vectorized loop will be:

vector.ph:                                        ; preds = %overflow.checked
  %broadcast.splatinsert3 = insertelement <4 x i32> undef, i32 %1, i32 0
  %broadcast.splat4 = shufflevector <4 x i32> %broadcast.splatinsert3, <4 x i32> undef, <4 x i32> zeroinitializer // no undef
  %broadcast.splatinsert5 = insertelement <4 x i32> undef, i32 %.pr6.us.us.pre.i, i32 0
  %broadcast.splat6 = shufflevector <4 x i32> %broadcast.splatinsert5, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ %4, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ %7, %vector.ph ], [ %10, %vector.body ]
  %8 = trunc i32 %index to i8
  %broadcast.splatinsert = insertelement <4 x i8> undef, i8 %8, i32 0
  %broadcast.splat = shufflevector <4 x i8> %broadcast.splatinsert, <4 x i8> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i8> %broadcast.splat, <i8 0, i8 1, i8 2, i8 3>
  %9 = srem <4 x i32> %broadcast.splat4, %broadcast.splat6
  %10 = and <4 x i32> %9, %vec.phi
  %11 = add <4 x i8> %induction, <i8 1, i8 1, i8 1, i8 1>
  %12 = icmp eq <4 x i8> %11, zeroinitializer
  %index.next = add i32 %index, 4
  %13 = icmp eq i32 %index.next, %end.idx.rnd.down
  br i1 %13, label %middle.block, label %vector.body, !llvm.loop !6

What is important is to note that the srem operation on the vector on vector body does *not* operate on undef values.

Now, if i ran instcombine over this IR it will wrongly combine this to:

vector.ph:                                        ; preds = %overflow.checked
  %broadcast.splatinsert3 = insertelement <4 x i32> undef, i32 %1, i32 0 // partial undef vector
  %broadcast.splatinsert5 = insertelement <4 x i32> undef, i32 %.pr6.us.us.pre.i, i32 0
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ %2, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ %7, %vector.ph ], [ %12, %vector.body ]
  %vec.phi2 = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %13, %vector.body ]
  %8 = srem <4 x i32> %broadcast.splatinsert3, %broadcast.splatinsert5 // SREM on undef!
  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
  %10 = srem <4 x i32> %broadcast.splatinsert3, %broadcast.splatinsert5
  %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer
  %12 = and <4 x i32> %9, %vec.phi
  %13 = and <4 x i32> %11, %vec.phi2
  %index.next = add i32 %index, 8
  %14 = icmp eq i32 %index.next, %end.idx.rnd.down

And we get a division on undefined!

Comment 6 Arnold Schwaighofer 2014-06-22 12:01:23 PDT

Created attachment 12692 [details]
t3.ll testcase

Comment 7 Sanjay Patel 2014-07-07 18:12:07 PDT

In InstructionCombining.cpp -> InstCombiner::SimplifyVectorOp():

  // If both arguments of binary operation are shuffles, which use the same
  // mask and shuffle within a single vector, it is worthwhile to move the
  // shuffle after binary operation:
  //   Op(shuffle(v1, m), shuffle(v2, m)) -> shuffle(Op(v1, v2), m)

So this:
@a = global i32 1, align 4
@b = global i32 1, align 4

define i32 @main() #0 {
entry:
  %a1 = load i32* @a, align 4
  %b1 = load i32* @b, align 4
  %insert1 = insertelement <4 x i32> undef, i32 %a1, i32 0
  %insert2 = insertelement <4 x i32> undef, i32 %b1, i32 0
  %splat1 = shufflevector <4 x i32> %insert1, <4 x i32> undef, <4 x i32> zeroinitializer
  %splat2 = shufflevector <4 x i32> %insert2, <4 x i32> undef, <4 x i32> zeroinitializer
  %bar = srem <4 x i32> %splat1, %splat2
  %ext1 = extractelement <4 x i32> %bar, i32 0
  %ext2 = extractelement <4 x i32> %bar, i32 1
  store i32 %ext1, i32* @a, align 4
  store i32 %ext2, i32* @b, align 4
  ret i32 0
}

-------------------------------------------

Becomes this:
@a = global i32 1, align 4
@b = global i32 1, align 4

define i32 @main() {
entry:
  %a1 = load i32* @a, align 4
  %b1 = load i32* @b, align 4
  %insert1 = insertelement <4 x i32> undef, i32 %a1, i32 0
  %insert2 = insertelement <4 x i32> undef, i32 %b1, i32 0
!!! the splats are gone, but we're still doing a vector srem
  %0 = srem <4 x i32> %insert1, %insert2
  %ext1 = extractelement <4 x i32> %0, i32 0
  %ext2 = extractelement <4 x i32> %0, i32 0 <--- changed from element 1
  store i32 %ext1, i32* @a, align 4
  store i32 %ext2, i32* @b, align 4
  ret i32 0
}

--------------------------------------------

The x86-64 backend (any backend?) apparently ignores the vector srem op because it sees we're only using the 0 element, so this code doesn't fail even though the IR is wrong. 

But a more complex test case will show the div-by-0 error:
$ cat 20059.ll 
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"

@a = global i32 1, align 4

define i32 @main() #0 {
entry:
  %a1 = load i32* @a, align 4
  %insert0 = insertelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 %a1, i32 0
  %insert1 = insertelement <4 x i32> undef, i32 %a1, i32 0
  %insert2 = insertelement <4 x i32> undef, i32 %a1, i32 0
  %splat1 = shufflevector <4 x i32> %insert1, <4 x i32> undef, <4 x i32> zeroinitializer
  %splat2 = shufflevector <4 x i32> %insert2, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %body

body:                                      
  %index = phi i32 [ %a1, %entry ], [ %index.next, %body ]
  %vec.phi = phi <4 x i32> [ %insert0, %entry ], [ %foo1, %body ]
  %bar6 = srem <4 x i32> %splat1, %splat2
  %foo1 = and <4 x i32> %bar6, %vec.phi
  %index.next = add i32 %index, 1024
  %indexcmp = icmp eq i32 %index.next, 1
  br i1 %indexcmp, label %exit, label %body

exit:                                     
  %foo3 = extractelement <4 x i32> %foo1, i32 0
  store i32 %foo3, i32* @a, align 4
  ret i32 0
}

$ ./opt 20059.ll -instcombine | ./llc | ./clang -x assembler - -o a.out && ./a.out 
Floating point exception: 8

Comment 8 Sanjay Patel 2014-07-07 18:35:18 PDT

Maybe this is another case of needing to guard against ops that can trap (bug 17073 and bug 16729)?

What about vector FP ops or vector compares that generate selection masks?

Comment 9 Sanjay Patel 2014-07-08 09:47:11 PDT

We can defeat post-optimization by splitting into 2 functions:

@a = global i32 1, align 4
@b = global i32 1, align 4

define <4 x i32> @foo(<4 x i32> %p1, <4 x i32> %p2) {
entry:
  %splat1 = shufflevector <4 x i32> %p1, <4 x i32> undef, <4 x i32> zeroinitializer
  %splat2 = shufflevector <4 x i32> %p2, <4 x i32> undef, <4 x i32> zeroinitializer
  %retval = srem <4 x i32> %splat1, %splat2
  ret <4 x i32> %retval
}

define i32 @main() {
  %a1 = load i32* @a, align 4
  %b1 = load i32* @b, align 4
  %insert1 = insertelement <4 x i32> zeroinitializer, i32 %a1, i32 0
  %insert2 = insertelement <4 x i32> zeroinitializer, i32 %b1, i32 0
  %retfoo = call <4 x i32> @foo(<4 x i32> %insert1, <4 x i32> %insert2)
  ret i32 0
}

$ ./opt srem.ll | ./llc | ./clang -x assembler - -o a.out && ./a.out 
$ ./opt srem.ll -instcombine | ./llc | ./clang -x assembler - -o a.out && ./a.out 
Floating point exception: 8

Comment 10 Sanjay Patel 2014-07-08 09:49:48 PDT

Even better - remove the globals and loads:

define <4 x i32> @foo(<4 x i32> %p1, <4 x i32> %p2) {
entry:
  %splat1 = shufflevector <4 x i32> %p1, <4 x i32> undef, <4 x i32> zeroinitializer
  %splat2 = shufflevector <4 x i32> %p2, <4 x i32> undef, <4 x i32> zeroinitializer 
  %retval = srem <4 x i32> %splat1, %splat2
  ret <4 x i32> %retval
}

define i32 @main() {
  %insert1 = insertelement <4 x i32> zeroinitializer, i32 1, i32 0
  %insert2 = insertelement <4 x i32> zeroinitializer, i32 1, i32 0
  %retfoo = call <4 x i32> @foo(<4 x i32> %insert1, <4 x i32> %insert2)
  ret i32 0
}

Comment 11 Sanjay Patel 2014-07-08 14:21:33 PDT

One line patch posted for review:
http://reviews.llvm.org/D4424

I've convinced myself that integer comparisons will be ok with this transform, but I don't think we should be doing this transform for any vector FP ops. Performing any kind of op on unknown FP operands could cause an exception.

If that is correct, then I can submit another one line patch to disallow FP vectors and this test case in test/Transforms/InstCombine/vec_shuffle.ll should be removed:

define <4 x float> @shuffle_17fsub(<4 x float> %v1, <4 x float> %v2) nounwind uwtable {
; CHECK-LABEL: @shuffle_17fsub(
; CHECK-NOT: shufflevector
; CHECK: fsub <4 x float> %v1, %v2
; CHECK: shufflevector
  %t1 = shufflevector <4 x float> %v1, <4 x float> zeroinitializer,
                      <4 x i32> <i32 1, i32 2, i32 3, i32 0>
  %t2 = shufflevector <4 x float> %v2, <4 x float> zeroinitializer,
                      <4 x i32> <i32 1, i32 2, i32 3, i32 0>
  %r = fsub <4 x float> %t1, %t2
  ret <4 x float> %r
}

Comment 12 Sanjay Patel 2014-07-09 11:50:06 PDT

Fixed with:
http://llvm.org/viewvc/llvm-project?view=revision&revision=212629

The FP question is still open. I'll try to make a test case for that when I get a chance.