define void @test(i32* %p, i32* %p2) { %v = load i32, i32* %p, align 4 %v2 = load i32, i32* %p2, align 4 %cmp = icmp ult i32 %v2, %v %sel = select i1 %cmp, i32* %p2, i32* %p %p8 = bitcast i32* %p to i8* %sel8 = bitcast i32* %sel to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %p8, i8* align 4 %sel8, i64 4, i1 false) ret void } ; Function Attrs: argmemonly nounwind willreturn declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #0 attributes #0 = { argmemonly nounwind willreturn } This loops on LLVM 10. It is fixed in master as a side-effect of https://github.com/llvm/llvm-project/commit/80581966771a6d705daddbbf640a91c94652ceb5. InstCombine log: INSTCOMBINE ITERATION #1 on test IC: ADDING: 8 instrs to worklist IC: Visiting: %v = load i32, i32* %p, align 4 IC: Visiting: %v2 = load i32, i32* %p2, align 4 IC: Visiting: %cmp = icmp ult i32 %v2, %v IC: Visiting: %sel = select i1 %cmp, i32* %p2, i32* %p IC: Visiting: %p8 = bitcast i32* %p to i8* IC: Visiting: %sel8 = bitcast i32* %sel to i8* IC: Visiting: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %p8, i8* align 4 %sel8, i64 4, i1 false) IC: ADD: %1 = bitcast i8* %sel8 to i32* IC: ADD: %2 = bitcast i8* %p8 to i32* IC: ADD: %3 = load i32, i32* %1 IC: ADD: store i32 %3, i32* %2 IC: Mod = call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %p8, i8* align 4 %sel8, i64 4, i1 false) New = call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %p8, i8* align 4 %sel8, i64 0, i1 false) IC: ADD: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %p8, i8* align 4 %sel8, i64 0, i1 false) IC: Visiting: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %p8, i8* align 4 %sel8, i64 0, i1 false) IC: ERASE call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %p8, i8* align 4 %sel8, i64 0, i1 false) IC: ADD: %p8 = bitcast i32* %p to i8* IC: ADD: %sel8 = bitcast i32* %sel to i8* IC: Visiting: %sel8 = bitcast i32* %sel to i8* IC: Visiting: %p8 = bitcast i32* %p to i8* IC: Visiting: store i32 %3, i32* %2, align 4 IC: ADD: %3 = load i32, i32* %1 IC: ADD: store i32 %3, i32* %2 IC: ADD: store i32 %4, i32* %2, align 4 IC: Replacing %4 = load i32, i32* %1, align 4 with i32 undef IC: ERASE %4 = load i32, i32* %1, align 4 IC: ERASE store i32 undef, i32* %2, align 4 IC: Visiting: store i32 %3, i32* %2, align 4 IC: ADD: %3 = load i32, i32* %1 IC: ADD: store i32 %3, i32* %2 IC: ADD: store i32 %4, i32* %2, align 4 IC: Replacing %4 = load i32, i32* %1, align 4 with i32 undef IC: ERASE %4 = load i32, i32* %1, align 4 IC: ERASE store i32 undef, i32* %2, align 4 The memcpy() transform leaves behind a redundant bitcast-of-bitcast, which (prior to worklist order fixes) never gets a chance to be cleaned up, because the removeBitcastsFromLoadStoreOnMinMax() transform goes into an infinite loop when it encounters this structure: define void @test(i32* %p, i32* %p2) { %v = load i32, i32* %p, align 4 %v2 = load i32, i32* %p2, align 4 %cmp = icmp ult i32 %v2, %v %sel = select i1 %cmp, i32* %p2, i32* %p %p8 = bitcast i32* %p to i8* %sel8 = bitcast i32* %sel to i8* %1 = bitcast i8* %sel8 to i32* %2 = bitcast i8* %p8 to i32* %3 = load i32, i32* %1, align 4 store i32 %3, i32* %2, align 4 ret void }
Candidate patch: https://reviews.llvm.org/D74278
Landed in https://reviews.llvm.org/rG23db9724d0e5490fa5a2a726acf015f84e2c87cf, keeping open to track backport.
(In reply to Nikita Popov from comment #2) > Landed in > https://reviews.llvm.org/rG23db9724d0e5490fa5a2a726acf015f84e2c87cf, keeping > open to track backport. Thanks! Cherry-picked as 9db3e5d5156bc2a3ba8ec0d70ab7069a82472fbb