LLVM Bugzilla is read-only and represents the historical archive of all LLVM issues filed before November 26, 2021. Use GitHub to submit new LLVM bugs.

Bug 49688 - WRONG code CFGOpt? InstCombine?
Summary: WRONG code CFGOpt? InstCombine?
Status: RESOLVED FIXED
Alias: None
Product: libraries
Classification: Unclassified
Component: Scalar Optimizations
Version: trunk
Hardware: PC Linux
Importance: P normal
Assignee: Juneyoung Lee
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2021-03-22 16:01 PDT by Jonas Paulsson
Modified: 2021-05-10 18:51 PDT
CC: 6 users

See Also:
Fixed By Commit(s): 5207cde5cb4147155c469e1861427ea9d569bd5a


Attachments
reduced testcase (173 bytes, text/plain)
2021-03-22 16:01 PDT, Jonas Paulsson

Description Jonas Paulsson 2021-03-22 16:01:22 PDT
Created attachment 24682
reduced testcase

This program (wrong0.i):

static int *a;
int b, d;
int *c = &b;
int main() {
  int e[3] = {2, 1, -2139220656};
  a = &e[0];
  d = (e[2] < 0) || (e[2] > (7 >> e[2]));
  *c = d;
  printf("%d\n", b);
}

I believe the right-shift by a known negative amount is undefined, but since the first expression is always true, the program as a whole is still well-defined, right?

Since e[2] is always less than 0, I would expect the second expression (after the '||') never to be evaluated. However, SimplifyCFGPass does not realize that the first expression is always true and merges them into a select, so both comparisons are always evaluated. I wonder why this is allowed...?

  %1 = load i32, i32* %arrayidx1, align 4, !tbaa !6
  %cmp = icmp slt i32 %1, 0
  %shr = ashr i32 7, %1
  %cmp4 = icmp sgt i32 %1, %shr
  %2 = select i1 %cmp, i1 true, i1 %cmp4

Then InstCombine decides to remove the first check, the less-than-zero comparison:

  %arrayidx1 = getelementptr inbounds [3 x i32], [3 x i32]* %e, i64 0, i64 2
  %1 = load i32, i32* %arrayidx1, align 4, !tbaa !6
  %shr = lshr i32 7, %1
  %2 = icmp ugt i32 %1, %shr
  %lor.ext = zext i1 %2 to i32
  store i32 %lor.ext, i32* @d, align 4, !tbaa !6

So it seems that the user has written a well-defined program, but the compiler has failed to deduce this from the known constants.


clang -march=arch13 -O1 wrong0.i -o a.out -w; ./a.out
1
clang -march=arch13 -O2 wrong0.i -o a.out -w; ./a.out
0
clang -march=arch13 -O2 wrong0.i -o a.out -mllvm -enable-gvn-memdep=false -w; ./a.out
1
clang -march=arch13 -O2 wrong0.i -o a.out -mllvm -memssa-check-limit=0 -w; ./a.out
0

The store of '0' to @d is then later produced by GVN:

*** IR Dump After MergedLoadStoreMotionPass ***
; Function Attrs: nofree nounwind
define dso_local signext i32 @main() local_unnamed_addr #0 {
entry:
  %e = alloca [3 x i32], align 4
  %0 = bitcast [3 x i32]* %e to i8*
  call void @llvm.lifetime.start.p0i8(i64 12, i8* nonnull %0) #2
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(12) %0, i8* noundef nonnull align 4 dereferenceable(12) bitcast ([3 x i32]* @__const.main.e to i8*), i64 12, i1 false)
  %arrayidx = getelementptr inbounds [3 x i32], [3 x i32]* %e, i64 0, i64 0
  store i32* %arrayidx, i32** @a, align 8, !tbaa !2
  %arrayidx1 = getelementptr inbounds [3 x i32], [3 x i32]* %e, i64 0, i64 2
  %1 = load i32, i32* %arrayidx1, align 4, !tbaa !6
  %shr = lshr i32 7, %1
  %2 = icmp ugt i32 %1, %shr
  %lor.ext = zext i1 %2 to i32
  store i32 %lor.ext, i32* @d, align 4, !tbaa !6
  %3 = load i32*, i32** @c, align 8, !tbaa !2
  store i32 %lor.ext, i32* %3, align 4, !tbaa !6
  %4 = load i32, i32* @b, align 4, !tbaa !6
  %call = call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 signext %4)
  call void @llvm.lifetime.end.p0i8(i64 12, i8* nonnull %0) #2
  ret i32 0
}

*** IR Dump After GVN ***
; Function Attrs: nofree nounwind
define dso_local signext i32 @main() local_unnamed_addr #0 {
entry:
  %e = alloca [3 x i32], align 4
  %0 = bitcast [3 x i32]* %e to i8*
  call void @llvm.lifetime.start.p0i8(i64 12, i8* nonnull %0) #2
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(12) %0, i8* noundef nonnull align 4 dereferenceable(12) bitcast ([3 x i32]* @__const.main.e to i8*), i64 12, i1 false)
  %arrayidx = getelementptr inbounds [3 x i32], [3 x i32]* %e, i64 0, i64 0
  store i32* %arrayidx, i32** @a, align 8, !tbaa !2
  %arrayidx1 = getelementptr inbounds [3 x i32], [3 x i32]* %e, i64 0, i64 2
  store i32 0, i32* @d, align 4, !tbaa !6
  %1 = load i32*, i32** @c, align 8, !tbaa !2
  store i32 0, i32* %1, align 4, !tbaa !6
  %2 = load i32, i32* @b, align 4, !tbaa !6
  %call = call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 signext %2)
  call void @llvm.lifetime.end.p0i8(i64 12, i8* nonnull %0) #2
  ret i32 0
}
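
A hand-worked sketch of how the 0 appears, assuming GVN/MemDep forwards the constant from the memcpy into the load - my reading of the fold chain, not verified against the pass output:

  ; with %1 forwarded as the constant -2139220656:
  ;   %shr     = lshr i32 7, -2139220656        ; shift amount >= 32 -> poison
  ;   %2       = icmp ugt i32 -2139220656, %shr ; poison, folded to false
  ;   %lor.ext = zext i1 false to i32           ; 0, the value stored to @d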
Comment 1 Juneyoung Lee 2021-03-22 17:36:43 PDT
A reduced input:
```
define i1 @f(i32 %i1) {
entry:
  %cmp = icmp slt i32 %i1, 0
  %shr = ashr i32 7, %i1
  %cmp4 = icmp sgt i32 %i1, %shr
  %i2 = select i1 %cmp, i1 true, i1 %cmp4
  ret i1 %i2
}
```

It's again due to the old transformation in InstCombine 'select i1 %a, i1 true, i1 %b' => 'or i1 %a, %b'.
The former expression effectively stops propagation of the result of the invalid shift, whereas the latter lets it propagate, which allows InstCombine to reason across the two icmps and simplify the result.
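
For illustration, a hand-constructed sketch (the function names are made up, and %b stands for something like the result of an over-wide ashr): with %a = true and %b = poison, the two forms disagree:
```
define i1 @select_form(i1 %a, i1 %b) {
  %r = select i1 %a, i1 true, i1 %b   ; %a = true  -> %r = true, poison in %b is ignored
  ret i1 %r
}

define i1 @or_form(i1 %a, i1 %b) {
  %r = or i1 %a, %b                   ; %b = poison -> %r = poison
  ret i1 %r
}
```
So @or_form may return poison where @select_form is guaranteed to return true, which is what lets the later icmp reasoning go wrong.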

The transformation should be disabled; I'm sending patches to minimize performance impact after disabling it (https://reviews.llvm.org/D93065).
Comment 2 Jonas Paulsson 2021-03-30 13:03:13 PDT
I think I saw this again:

define dso_local signext i32 @f(i32 signext %g, i32 zeroext %h) local_unnamed_addr #0 {
entry:
  %cmp = icmp slt i32 %g, 0
  %shr = ashr i32 7, %h
  %cmp1 = icmp sgt i32 %g, %shr
  %0 = select i1 %cmp, i1 true, i1 %cmp1
  %lor.ext = zext i1 %0 to i32
  ret i32 %lor.ext
}

=> 

*** IR Dump After InstCombinePass ***
define dso_local signext i32 @f(i32 signext %g, i32 zeroext %h) local_unnamed_addr #0 {
entry:
  %shr = lshr i32 7, %h
  %0 = icmp ult i32 %shr, %g
  %lor.ext = zext i1 %0 to i32
  ret i32 %lor.ext
}

This doesn't look right - the check for g < 0 is simply removed for some reason.

opt -O2 fun.ll -march=z14

Input:
define i32 @f(i32 signext %g, i32 zeroext %h) #0 {
entry:
  %g.addr = alloca i32, align 4
  %h.addr = alloca i32, align 4
  store i32 %g, i32* %g.addr, align 4
  store i32 %h, i32* %h.addr, align 4
  %0 = load i32, i32* %g.addr, align 4
  %cmp = icmp slt i32 %0, 0
  br i1 %cmp, label %lor.end, label %lor.rhs

lor.rhs:                                          ; preds = %entry                                                                                                            
  %1 = load i32, i32* %g.addr, align 4
  %2 = load i32, i32* %h.addr, align 4
  %shr = ashr i32 7, %2
  %cmp1 = icmp sgt i32 %1, %shr
  br label %lor.end

lor.end:                                          ; preds = %lor.rhs, %entry                                                                                                  
  %3 = phi i1 [ true, %entry ], [ %cmp1, %lor.rhs ]
  %lor.ext = zext i1 %3 to i32
  ret i32 %lor.ext
}
Comment 3 Roman Lebedev 2021-03-31 02:47:09 PDT
(In reply to Jonas Paulsson from comment #2)
> I think I saw this again:
> 
> define dso_local signext i32 @f(i32 signext %g, i32 zeroext %h)
> local_unnamed_addr #0 {
> entry:
>   %cmp = icmp slt i32 %g, 0
>   %shr = ashr i32 7, %h
>   %cmp1 = icmp sgt i32 %g, %shr
>   %0 = select i1 %cmp, i1 true, i1 %cmp1
>   %lor.ext = zext i1 %0 to i32
>   ret i32 %lor.ext
> }
> 
> => 
> 
> *** IR Dump After InstCombinePass ***
> define dso_local signext i32 @f(i32 signext %g, i32 zeroext %h)
> local_unnamed_addr #0 {
> entry:
>   %shr = lshr i32 7, %h
>   %0 = icmp ult i32 %shr, %g
>   %lor.ext = zext i1 %0 to i32
>   ret i32 %lor.ext
> }
> 
> This doesn't look right - the check if g < 0 is simply removed for some
> reason.
> 
> opt -O2 fun.ll -march=z14
> 
> Input:
> define i32 @f(i32 signext %g, i32 zeroext %h) #0 {
> entry:
>   %g.addr = alloca i32, align 4
>   %h.addr = alloca i32, align 4
>   store i32 %g, i32* %g.addr, align 4
>   store i32 %h, i32* %h.addr, align 4
>   %0 = load i32, i32* %g.addr, align 4
>   %cmp = icmp slt i32 %0, 0
>   br i1 %cmp, label %lor.end, label %lor.rhs
> 
> lor.rhs:                                          ; preds = %entry          
> 
>   %1 = load i32, i32* %g.addr, align 4
>   %2 = load i32, i32* %h.addr, align 4
>   %shr = ashr i32 7, %2
>   %cmp1 = icmp sgt i32 %1, %shr
>   br label %lor.end
> 
> lor.end:                                          ; preds = %lor.rhs, %entry
> 
>   %3 = phi i1 [ true, %entry ], [ %cmp1, %lor.rhs ]
>   %lor.ext = zext i1 %3 to i32
>   ret i32 %lor.ext
> }

Yep, https://alive2.llvm.org/ce/z/NGCDRB
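
One possible counterexample, hand-worked here rather than taken from the Alive2 output: take %g = -1 and %h = 32.

  ; source: %cmp = icmp slt i32 -1, 0        -> true, so %lor.rhs is never executed
  ;         %3   = phi i1 [ true, %entry ]   -> the function returns 1
  ; target: %shr = lshr i32 7, 32            -> poison (shift amount >= bit width)
  ;         %0   = icmp ult i32 poison, -1   -> poison
  ;         the returned zext is poison, which does not refine the source's 1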
Comment 4 Juneyoung Lee 2021-03-31 03:33:33 PDT
I'll make a patch that conditionally disables the select -> and/or transformation. I think it is now time to start ditching the optimization.
Comment 5 Juneyoung Lee 2021-03-31 10:48:39 PDT
https://reviews.llvm.org/D99674
Comment 6 Juneyoung Lee 2021-04-08 18:27:12 PDT
Fixed via https://reviews.llvm.org/rG5207cde5cb4147155c469e1861427ea9d569bd5a