The current clang trunk miscompiles the following code at -O3 on x86_64-linux-gnu (in both 32-bit and 64-bit modes). This is a regression from 3.4. $ clang-trunk -v clang version 3.5.0 (trunk 211006) Target: x86_64-unknown-linux-gnu Thread model: posix Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.6 Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.6.4 Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.7 Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.7.3 Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8 Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8.1 Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8 Candidate multilib: .;@m64 Candidate multilib: 32;@m32 Selected multilib: .;@m64 $ $ clang-trunk -O2 small.c; a.out $ clang-3.4 -O3 small.c; a.out $ $ clang-trunk -O3 small.c $ a.out Floating point exception (core dumped) $ --------------------- int a, b, c, d = 1; char e = 1; void fn1 () { int h; for (; e; e++) { for (; b;) d--; h = (d ? a % d : 0); c &= h; } } int main () { fn1 (); return 0; }
IR test case produced via: $ ./clang 20059.c -O3 -mllvm -disable-llvm-optzns -S -emit-llvm $ cat 20059.ll target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" @d = global i32 1, align 4 @e = global i8 1, align 1 @b = common global i32 0, align 4 @a = common global i32 0, align 4 @c = common global i32 0, align 4 ; Function Attrs: nounwind ssp uwtable define void @fn1() #0 { entry: %h = alloca i32, align 4 br label %for.cond for.cond: ; preds = %for.inc, %entry %0 = load i8* @e, align 1, !tbaa !1 %tobool = icmp ne i8 %0, 0 br i1 %tobool, label %for.body, label %for.end5 for.body: ; preds = %for.cond br label %for.cond1 for.cond1: ; preds = %for.body3, %for.body %1 = load i32* @b, align 4, !tbaa !4 %tobool2 = icmp ne i32 %1, 0 br i1 %tobool2, label %for.body3, label %for.end for.body3: ; preds = %for.cond1 %2 = load i32* @d, align 4, !tbaa !4 %dec = add nsw i32 %2, -1 store i32 %dec, i32* @d, align 4, !tbaa !4 br label %for.cond1 for.end: ; preds = %for.cond1 %3 = load i32* @d, align 4, !tbaa !4 %tobool4 = icmp ne i32 %3, 0 br i1 %tobool4, label %cond.true, label %cond.false cond.true: ; preds = %for.end %4 = load i32* @a, align 4, !tbaa !4 %5 = load i32* @d, align 4, !tbaa !4 %rem = srem i32 %4, %5 br label %cond.end cond.false: ; preds = %for.end br label %cond.end cond.end: ; preds = %cond.false, %cond.true %cond = phi i32 [ %rem, %cond.true ], [ 0, %cond.false ] store i32 %cond, i32* %h, align 4, !tbaa !4 %6 = load i32* %h, align 4, !tbaa !4 %7 = load i32* @c, align 4, !tbaa !4 %and = and i32 %7, %6 store i32 %and, i32* @c, align 4, !tbaa !4 br label %for.inc for.inc: ; preds = %cond.end %8 = load i8* @e, align 1, !tbaa !1 %inc = add i8 %8, 1 store i8 %inc, i8* @e, align 1, !tbaa !1 br label %for.cond for.end5: ; preds = %for.cond ret void } ; Function Attrs: nounwind ssp uwtable define i32 @main() #0 { entry: %retval = alloca i32, align 4 store i32 0, i32* %retval call void @fn1() ret i32 0 } attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.ident = !{!0} !0 = metadata !{metadata !"clang version 3.5.0 (210980)"} !1 = metadata !{metadata !2, metadata !2, i64 0} !2 = metadata !{metadata !"omnipotent char", metadata !3, i64 0} !3 = metadata !{metadata !"Simple C/C++ TBAA"} !4 = metadata !{metadata !5, metadata !5, i64 0} !5 = metadata !{metadata !"int", metadata !2, i64 0}
Looks like a bug in the loop vectorizer: $ ./opt -tbaa -basicaa -inline -early-cse -instcombine -simplifycfg -loop-rotate -licm -loop-unswitch -gvn -sccp -instcombine -correlated-propagation -dse 20059.ll -S -o pre_vector.ll $ ./opt -loop-vectorize pre_vector.ll -S Instruction does not dominate all uses! %6 = zext i8 %0 to i32 %bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ %6, %for.cond1.preheader.lr.ph.split.us.i.split.us ] LLVM ERROR: Broken function found, compilation aborted!
The original code looks like it is a duplicate PR19823 (a bug in the runtime unroller similar to what we had in the vectorizer). What happens if this is compiled with -fno-unroll-loops?
Hi Arnold - thanks for your help. Was comment 3 intended for bug 20063? I don't see any difference in this test case with -fno-unroll-loops: $ ./clang -O3 20059.c && ./a.out Floating point exception: 8 $ ./clang -O3 -fno-unroll-loops 20059.c && ./a.out Floating point exception: 8
(In reply to comment #2) > Looks like a bug in the loop vectorizer: > > $ ./opt -tbaa -basicaa -inline -early-cse -instcombine -simplifycfg > -loop-rotate -licm -loop-unswitch -gvn -sccp -instcombine > -correlated-propagation -dse 20059.ll -S -o pre_vector.ll > > $ ./opt -loop-vectorize pre_vector.ll -S > Instruction does not dominate all uses! > %6 = zext i8 %0 to i32 > %bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ %6, > %for.cond1.preheader.lr.ph.split.us.i.split.us ] > LLVM ERROR: Broken function found, compilation aborted! Yes, this is a separate error that I introduced :( when I fixed the back edge overflow error. I have fixed this in ToT (r211460). But, there is a separate error causing the bug reported here. It is not an error in the loop vectorizer. If it dump the IR right after the vectorizer has vectorized the loop and llc that test case it works fine. However, instcombine creates an srem on a partially undef vector. % rm -f a.out && bin/opt -basicaa -loop-vectorize -S < t3.ll | bin/llc - -o - > tmp.s && bin/clang tmp.s && ./a.out vs. % rm -f a.out && bin/opt -basicaa -loop-vectorize -instcombine -S < t3.ll | bin/llc - -o - > tmp.s && bin/clang tmp.s && ./a.out Floating point exception: 8 The vectorized loop will be: vector.ph: ; preds = %overflow.checked %broadcast.splatinsert3 = insertelement <4 x i32> undef, i32 %1, i32 0 %broadcast.splat4 = shufflevector <4 x i32> %broadcast.splatinsert3, <4 x i32> undef, <4 x i32> zeroinitializer // no undef %broadcast.splatinsert5 = insertelement <4 x i32> undef, i32 %.pr6.us.us.pre.i, i32 0 %broadcast.splat6 = shufflevector <4 x i32> %broadcast.splatinsert5, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ %4, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ %7, %vector.ph ], [ %10, %vector.body ] %8 = trunc i32 %index to i8 %broadcast.splatinsert = insertelement <4 x i8> undef, i8 %8, i32 0 %broadcast.splat = shufflevector <4 x i8> %broadcast.splatinsert, <4 x i8> undef, <4 x i32> zeroinitializer %induction = add <4 x i8> %broadcast.splat, <i8 0, i8 1, i8 2, i8 3> %9 = srem <4 x i32> %broadcast.splat4, %broadcast.splat6 %10 = and <4 x i32> %9, %vec.phi %11 = add <4 x i8> %induction, <i8 1, i8 1, i8 1, i8 1> %12 = icmp eq <4 x i8> %11, zeroinitializer %index.next = add i32 %index, 4 %13 = icmp eq i32 %index.next, %end.idx.rnd.down br i1 %13, label %middle.block, label %vector.body, !llvm.loop !6 What is important is to note that the srem operation on the vector on vector body does *not* operate on undef values. Now, if i ran instcombine over this IR it will wrongly combine this to: vector.ph: ; preds = %overflow.checked %broadcast.splatinsert3 = insertelement <4 x i32> undef, i32 %1, i32 0 // partial undef vector %broadcast.splatinsert5 = insertelement <4 x i32> undef, i32 %.pr6.us.us.pre.i, i32 0 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ %2, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ %7, %vector.ph ], [ %12, %vector.body ] %vec.phi2 = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %13, %vector.body ] %8 = srem <4 x i32> %broadcast.splatinsert3, %broadcast.splatinsert5 // SREM on undef! %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer %10 = srem <4 x i32> %broadcast.splatinsert3, %broadcast.splatinsert5 %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer %12 = and <4 x i32> %9, %vec.phi %13 = and <4 x i32> %11, %vec.phi2 %index.next = add i32 %index, 8 %14 = icmp eq i32 %index.next, %end.idx.rnd.down And we get a division on undefined!
Created attachment 12692 [details] t3.ll testcase
In InstructionCombining.cpp -> InstCombiner::SimplifyVectorOp(): // If both arguments of binary operation are shuffles, which use the same // mask and shuffle within a single vector, it is worthwhile to move the // shuffle after binary operation: // Op(shuffle(v1, m), shuffle(v2, m)) -> shuffle(Op(v1, v2), m) So this: @a = global i32 1, align 4 @b = global i32 1, align 4 define i32 @main() #0 { entry: %a1 = load i32* @a, align 4 %b1 = load i32* @b, align 4 %insert1 = insertelement <4 x i32> undef, i32 %a1, i32 0 %insert2 = insertelement <4 x i32> undef, i32 %b1, i32 0 %splat1 = shufflevector <4 x i32> %insert1, <4 x i32> undef, <4 x i32> zeroinitializer %splat2 = shufflevector <4 x i32> %insert2, <4 x i32> undef, <4 x i32> zeroinitializer %bar = srem <4 x i32> %splat1, %splat2 %ext1 = extractelement <4 x i32> %bar, i32 0 %ext2 = extractelement <4 x i32> %bar, i32 1 store i32 %ext1, i32* @a, align 4 store i32 %ext2, i32* @b, align 4 ret i32 0 } ------------------------------------------- Becomes this: @a = global i32 1, align 4 @b = global i32 1, align 4 define i32 @main() { entry: %a1 = load i32* @a, align 4 %b1 = load i32* @b, align 4 %insert1 = insertelement <4 x i32> undef, i32 %a1, i32 0 %insert2 = insertelement <4 x i32> undef, i32 %b1, i32 0 !!! the splats are gone, but we're still doing a vector srem %0 = srem <4 x i32> %insert1, %insert2 %ext1 = extractelement <4 x i32> %0, i32 0 %ext2 = extractelement <4 x i32> %0, i32 0 <--- changed from element 1 store i32 %ext1, i32* @a, align 4 store i32 %ext2, i32* @b, align 4 ret i32 0 } -------------------------------------------- The x86-64 backend (any backend?) apparently ignores the vector srem op because it sees we're only using the 0 element, so this code doesn't fail even though the IR is wrong. But a more complex test case will show the div-by-0 error: $ cat 20059.ll target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.10.0" @a = global i32 1, align 4 define i32 @main() #0 { entry: %a1 = load i32* @a, align 4 %insert0 = insertelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 %a1, i32 0 %insert1 = insertelement <4 x i32> undef, i32 %a1, i32 0 %insert2 = insertelement <4 x i32> undef, i32 %a1, i32 0 %splat1 = shufflevector <4 x i32> %insert1, <4 x i32> undef, <4 x i32> zeroinitializer %splat2 = shufflevector <4 x i32> %insert2, <4 x i32> undef, <4 x i32> zeroinitializer br label %body body: %index = phi i32 [ %a1, %entry ], [ %index.next, %body ] %vec.phi = phi <4 x i32> [ %insert0, %entry ], [ %foo1, %body ] %bar6 = srem <4 x i32> %splat1, %splat2 %foo1 = and <4 x i32> %bar6, %vec.phi %index.next = add i32 %index, 1024 %indexcmp = icmp eq i32 %index.next, 1 br i1 %indexcmp, label %exit, label %body exit: %foo3 = extractelement <4 x i32> %foo1, i32 0 store i32 %foo3, i32* @a, align 4 ret i32 0 } $ ./opt 20059.ll -instcombine | ./llc | ./clang -x assembler - -o a.out && ./a.out Floating point exception: 8
Maybe this is another case of needing to guard against ops that can trap (bug 17073 and bug 16729)? What about vector FP ops or vector compares that generate selection masks?
We can defeat post-optimization by splitting into 2 functions: @a = global i32 1, align 4 @b = global i32 1, align 4 define <4 x i32> @foo(<4 x i32> %p1, <4 x i32> %p2) { entry: %splat1 = shufflevector <4 x i32> %p1, <4 x i32> undef, <4 x i32> zeroinitializer %splat2 = shufflevector <4 x i32> %p2, <4 x i32> undef, <4 x i32> zeroinitializer %retval = srem <4 x i32> %splat1, %splat2 ret <4 x i32> %retval } define i32 @main() { %a1 = load i32* @a, align 4 %b1 = load i32* @b, align 4 %insert1 = insertelement <4 x i32> zeroinitializer, i32 %a1, i32 0 %insert2 = insertelement <4 x i32> zeroinitializer, i32 %b1, i32 0 %retfoo = call <4 x i32> @foo(<4 x i32> %insert1, <4 x i32> %insert2) ret i32 0 } $ ./opt srem.ll | ./llc | ./clang -x assembler - -o a.out && ./a.out $ ./opt srem.ll -instcombine | ./llc | ./clang -x assembler - -o a.out && ./a.out Floating point exception: 8
Even better - remove the globals and loads: define <4 x i32> @foo(<4 x i32> %p1, <4 x i32> %p2) { entry: %splat1 = shufflevector <4 x i32> %p1, <4 x i32> undef, <4 x i32> zeroinitializer %splat2 = shufflevector <4 x i32> %p2, <4 x i32> undef, <4 x i32> zeroinitializer %retval = srem <4 x i32> %splat1, %splat2 ret <4 x i32> %retval } define i32 @main() { %insert1 = insertelement <4 x i32> zeroinitializer, i32 1, i32 0 %insert2 = insertelement <4 x i32> zeroinitializer, i32 1, i32 0 %retfoo = call <4 x i32> @foo(<4 x i32> %insert1, <4 x i32> %insert2) ret i32 0 }
One line patch posted for review: http://reviews.llvm.org/D4424 I've convinced myself that integer comparisons will be ok with this transform, but I don't think we should be doing this transform for any vector FP ops. Performing any kind of op on unknown FP operands could cause an exception. If that is correct, then I can submit another one line patch to disallow FP vectors and this test case in test/Transforms/InstCombine/vec_shuffle.ll should be removed: define <4 x float> @shuffle_17fsub(<4 x float> %v1, <4 x float> %v2) nounwind uwtable { ; CHECK-LABEL: @shuffle_17fsub( ; CHECK-NOT: shufflevector ; CHECK: fsub <4 x float> %v1, %v2 ; CHECK: shufflevector %t1 = shufflevector <4 x float> %v1, <4 x float> zeroinitializer, <4 x i32> <i32 1, i32 2, i32 3, i32 0> %t2 = shufflevector <4 x float> %v2, <4 x float> zeroinitializer, <4 x i32> <i32 1, i32 2, i32 3, i32 0> %r = fsub <4 x float> %t1, %t2 ret <4 x float> %r }
Fixed with: http://llvm.org/viewvc/llvm-project?view=revision&revision=212629 The FP question is still open. I'll try to make a test case for that when I get a chance.