test: Transforms/MergeICmps/X86/entry-block-shuffled.ll src: compares 12/8, br, 0/0, br, 4/4, br, 12/12 tgt: compares 0/0, 4/4, br, 12/8, br, 12/12 The issue is that tgt branches on 0-8 values, which can be poison, while src wouldn't branch on those if e.g. 12/8 offsets are different. Needs a freeze. define i1 @opeq1(* nocapture nowrite dereferenceable(16) %a, * nocapture nowrite dereferenceable(16) %b) nofree { %entry: %first.i = gep inbounds * nocapture nowrite dereferenceable(16) %a, 16 x i64 0, 1 x i64 12 %0 = load i32, * %first.i, align 4 %first1.i = gep inbounds * nocapture nowrite dereferenceable(16) %b, 16 x i64 0, 1 x i64 8 %1 = load i32, * %first1.i, align 4 %cmp.i = icmp eq i32 %0, %1 br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit %land.rhs.i: %second.i = gep inbounds * nocapture nowrite dereferenceable(16) %a, 16 x i64 0, 1 x i64 0 %2 = load i32, * %second.i, align 4 %second2.i = gep inbounds * nocapture nowrite dereferenceable(16) %b, 16 x i64 0, 1 x i64 0 %3 = load i32, * %second2.i, align 4 %cmp3.i = icmp eq i32 %2, %3 br i1 %cmp3.i, label %land.rhs.i.2, label %opeq1.exit %land.rhs.i.2: %third.i = gep inbounds * nocapture nowrite dereferenceable(16) %a, 16 x i64 0, 1 x i64 4 %4 = load i32, * %third.i, align 4 %third2.i = gep inbounds * nocapture nowrite dereferenceable(16) %b, 16 x i64 0, 1 x i64 4 %5 = load i32, * %third2.i, align 4 %cmp4.i = icmp eq i32 %4, %5 br i1 %cmp4.i, label %land.rhs.i.3, label %opeq1.exit %land.rhs.i.3: %fourth.i = gep inbounds * nocapture nowrite dereferenceable(16) %a, 16 x i64 0, 1 x i64 12 %6 = load i32, * %fourth.i, align 4 %fourth2.i = gep inbounds * nocapture nowrite dereferenceable(16) %b, 16 x i64 0, 1 x i64 12 %7 = load i32, * %fourth2.i, align 4 %cmp5.i = icmp eq i32 %6, %7 br label %opeq1.exit %opeq1.exit: %8 = phi i1 [ 0, %entry ], [ 0, %land.rhs.i ], [ 0, %land.rhs.i.2 ], [ %cmp5.i, %land.rhs.i.3 ] ret i1 %8 } => define i1 @opeq1(* nocapture nowrite dereferenceable(16) %a, * nocapture nowrite dereferenceable(16) %b) nofree { %land.rhs.i+land.rhs.i.2: %0 = gep inbounds * nocapture nowrite dereferenceable(16) %a, 16 x i64 0, 1 x i64 0 %1 = gep inbounds * nocapture nowrite dereferenceable(16) %b, 16 x i64 0, 1 x i64 0 %cstr = bitcast * %0 to * %cstr3 = bitcast * %1 to * %memcmp = memcmp * %cstr, * %cstr3, i64 8 %2 = icmp eq i32 %memcmp, 0 br i1 %2, label %entry2, label %opeq1.exit %entry2: %3 = gep inbounds * nocapture nowrite dereferenceable(16) %a, 16 x i64 0, 1 x i64 12 %4 = gep inbounds * nocapture nowrite dereferenceable(16) %b, 16 x i64 0, 1 x i64 8 %5 = load i32, * %3, align 4 %6 = load i32, * %4, align 4 %7 = icmp eq i32 %5, %6 br i1 %7, label %land.rhs.i.31, label %opeq1.exit %land.rhs.i.31: %8 = gep inbounds * nocapture nowrite dereferenceable(16) %a, 16 x i64 0, 1 x i64 12 %9 = gep inbounds * nocapture nowrite dereferenceable(16) %b, 16 x i64 0, 1 x i64 12 %10 = load i32, * %8, align 4 %11 = load i32, * %9, align 4 %12 = icmp eq i32 %10, %11 br label %opeq1.exit %opeq1.exit: %13 = phi i1 [ %12, %land.rhs.i.31 ], [ 0, %entry2 ], [ 0, %land.rhs.i+land.rhs.i.2 ] ret i1 %13 } Transformation doesn't verify! ERROR: Source is more defined than target Example: * nocapture nowrite dereferenceable(16) %a = pointer(non-local, block_id=1, offset=10, attrs=3) * nocapture nowrite dereferenceable(16) %b = pointer(non-local, block_id=2, offset=88, attrs=3) Source: * %first.i = pointer(non-local, block_id=1, offset=22, attrs=3) i32 %0 = #x80000002 (2147483650, -2147483646) * %first1.i = pointer(non-local, block_id=2, offset=96, attrs=3) i32 %1 = #x00010100 (65792) i1 %cmp.i = #x0 (0) >> Jump to %opeq1.exit i1 %8 = #x0 (0) SOURCE MEMORY STATE =================== NON-LOCAL BLOCKS: Block 0 > size: 0 align: 1 alloc type: 0 address: 0 Block 1 > size: 129 align: 2 alloc type: 0 address: 122 Block 2 > size: 105 align: 2 alloc type: 0 address: 16 Target: * %0 = pointer(non-local, block_id=1, offset=10, attrs=3) * %1 = pointer(non-local, block_id=2, offset=88, attrs=3) * %cstr = pointer(non-local, block_id=1, offset=10, attrs=3) * %cstr3 = pointer(non-local, block_id=2, offset=88, attrs=3) i32 %memcmp = poison i1 %2 = poison UB triggered on br
Yeah, this is a known issue. The problem is that we don't have a way to say that @memcmp should work on frozen values. Freezing the arguments would just freeze the pointer addresses. Or possibly we should assume that @memcmp always works like this (note that this is a libcall, not an intrinsic). I think that may be fairly reasonable as long as the definition of memcmp is not known.
(In reply to Nikita Popov from comment #1) > Yeah, this is a known issue. The problem is that we don't have a way to say > that @memcmp should work on frozen values. Freezing the arguments would just > freeze the pointer addresses. > > Or possibly we should assume that @memcmp always works like this (note that > this is a libcall, not an intrinsic). I think that may be fairly reasonable > as long as the definition of memcmp is not known. The only concern with that approach is that then lowering memcmp in IR becomes trickier (-expandmemcmp). But otherwise it's ok, yes.
I fixed the part of this that can be fixed in MergeICmps in https://github.com/llvm/llvm-project/commit/f2fa6ad0476b318fdba46f09a2d59187228431ee. The remaining issue has to be addressed in ExpandMemCmp. I believe that one would need freeze even without reordering, because it generates wide loads.