Failure to simplify SIMD vector conversion. #17113

Open
silvasean opened this issue Jul 29, 2013 · 24 comments
Labels: bugzilla (Issues migrated from bugzilla), llvm:instcombine



silvasean commented Jul 29, 2013

Bugzilla Link: 16739
Version: trunk
OS: All
CC: @filcab, @LebedevRI, @RKSimon, @rotateright
Fixed by commit(s): r366441

Extended Description

This code was reduced from a function that converted between SIMD vector classes used by two different libraries; the source and destination vectors have a <4 x float> underlying storage, but notionally hold only {x, y, z} (and the destination duplicates z into the last lane; the source leaves it undefined I think).

typedef float __m128 __attribute__((__vector_size__(16)));
union ElementWiseAccess {
  ElementWiseAccess(__m128 v) : ReprM128(v) {}
  __m128 ReprM128;
  float ReprFloatArray[4];
  float getAt(int i) const { return ReprFloatArray[i]; }
};
// Making this return `const ElementWiseAccess` instead of `const ElementWiseAccess &`
// still results in a failure to optimize, but in a different way.
static const ElementWiseAccess &castToElementWiseAccess(const __m128 &t) {
  return reinterpret_cast<const ElementWiseAccess &>(t);
}
__m128 ConvertVectors(const __m128 &V) {
  // Replacing `castToElementWiseAccess` with directly calling
  // `ElementWiseAccess` makes the issue go away.
  return (__m128) { castToElementWiseAccess(V).getAt(0), //
                    castToElementWiseAccess(V).getAt(1), //
                    castToElementWiseAccess(V).getAt(2), //
                    castToElementWiseAccess(V).getAt(2) };
}

clang -O3 produces:

define <4 x float> @_Z14ConvertVectorsRKDv4_f(<4 x float>* nocapture readonly %V) #0 {
  %1 = bitcast <4 x float>* %V to [4 x float]*
  %2 = getelementptr inbounds <4 x float>* %V, i64 0, i64 0
  %3 = load float* %2, align 4, !tbaa !0
  %4 = insertelement <4 x float> undef, float %3, i32 0
  %5 = getelementptr inbounds [4 x float]* %1, i64 0, i64 1
  %6 = load float* %5, align 4, !tbaa !0
  %7 = insertelement <4 x float> %4, float %6, i32 1
  %8 = getelementptr inbounds [4 x float]* %1, i64 0, i64 2
  %9 = load float* %8, align 4, !tbaa !0
  %10 = insertelement <4 x float> %7, float %9, i32 2
  %11 = insertelement <4 x float> %10, float %9, i32 3
  ret <4 x float> %11
}

It appears that something is interfering with folding the load/insertelement sequence into a vector load + shufflevector.

Making the modification indicated in the comments (having castToElementWiseAccess return by value instead of by reference) results in:

define <4 x float> @_Z14ConvertVectorsRKDv4_f(<4 x float>* nocapture readonly %V) #0 {
  %1 = bitcast <4 x float>* %V to i8*
  %2 = bitcast <4 x float>* %V to double*
  %3 = load double* %2, align 16
  %4 = getelementptr inbounds i8* %1, i64 8
  %5 = bitcast i8* %4 to double*
  %6 = bitcast double %3 to i64
  %trunc = trunc i64 %6 to i32
  %bitcast = bitcast i32 %trunc to float
  %7 = insertelement <4 x float> undef, float %bitcast, i32 0
  %8 = lshr i64 %6, 32
  %9 = trunc i64 %8 to i32
  %10 = bitcast i32 %9 to float
  %11 = insertelement <4 x float> %7, float %10, i32 1
  %12 = load double* %5, align 8
  %13 = bitcast double %12 to i64
  %trunc6 = trunc i64 %13 to i32
  %bitcast7 = bitcast i32 %trunc6 to float
  %14 = insertelement <4 x float> %11, float %bitcast7, i32 2
  %15 = insertelement <4 x float> %14, float %bitcast7, i32 3
  ret <4 x float> %15
}

The issue in this case seems to be that clang lowers castToElementWiseAccess as returning {double, double}, which then prevents a <4 x float> load from being generated.
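
To make that concrete, the coercion looks roughly like the following sketch (a hypothetical function written in current opaque-pointer syntax, not the verbatim clang output): the x86-64 ABI returns the 16-byte union in two SSE registers, so the by-value helper is modeled as returning a { double, double } aggregate, and the caller then has to crack those doubles back apart with trunc/lshr/bitcast instead of keeping a <4 x float>.

define { double, double } @castToElementWiseAccess_ByVal_sketch(ptr %t) {
  ; load each 8-byte half of the union as a double (the coerced return slots)
  %lo = load double, ptr %t, align 16
  %hi.addr = getelementptr inbounds i8, ptr %t, i64 8
  %hi = load double, ptr %hi.addr, align 8
  ; build the { double, double } return aggregate
  %r0 = insertvalue { double, double } poison, double %lo, 0
  %r1 = insertvalue { double, double } %r0, double %hi, 1
  ret { double, double } %r1
}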

Making the modification of replacing the call to castToElementWiseAccess with directly invoking the constructor (e.g. ElementWiseAccess(V).getAt(<<<n>>>)) results in the following code, which is the desired codegen for the initial test case:

define <4 x float> @_Z14ConvertVectorsRKDv4_f(<4 x float>* nocapture readonly %V) #0 {
  %1 = load <4 x float>* %V, align 16, !tbaa !0
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %2
}

RKSimon commented Mar 30, 2017

Trunk still has issues with this, and it reminds me of [Bug #21780]:

typedef float __m128 __attribute__((__vector_size__(16)));
union ElementWiseAccess {
  ElementWiseAccess(__m128 v) : ReprM128(v) {}
  __m128 ReprM128;
  float ReprFloatArray[4];
  float getAt(int i) const { return ReprFloatArray[i]; }
};

static const ElementWiseAccess &castToElementWiseAccess_ByRef(const __m128 &t) {
  return reinterpret_cast<const ElementWiseAccess &>(t);
}
static const ElementWiseAccess castToElementWiseAccess_ByVal(const __m128 &t) {
  return reinterpret_cast<const ElementWiseAccess &>(t);
}

__m128 ConvertVectors_ByRef(const __m128 &V) {
  return (__m128) { castToElementWiseAccess_ByRef(V).getAt(0), //
                    castToElementWiseAccess_ByRef(V).getAt(1), //
                    castToElementWiseAccess_ByRef(V).getAt(2), //
                    castToElementWiseAccess_ByRef(V).getAt(2) };
}
__m128 ConvertVectors_ByVal(const __m128 &V) {
  return (__m128) { castToElementWiseAccess_ByVal(V).getAt(0), //
                    castToElementWiseAccess_ByVal(V).getAt(1), //
                    castToElementWiseAccess_ByVal(V).getAt(2), //
                    castToElementWiseAccess_ByVal(V).getAt(2) };
}
__m128 ConvertVectors_ByCopy(const __m128 &V) {
  return (__m128) { ElementWiseAccess(V).getAt(0), //
                    ElementWiseAccess(V).getAt(1), //
                    ElementWiseAccess(V).getAt(2), //
                    ElementWiseAccess(V).getAt(2) };
}

Looking at the IR, it knows that the entire vector is dereferenceable, but it still makes a mess of combining the inserted loads:

define <4 x float> @ConvertVectors_ByRef(<4 x float>* nocapture readonly dereferenceable(16)) {
  %2 = bitcast <4 x float>* %0 to [4 x float]*
  %3 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
  %4 = load float, float* %3, align 4, !tbaa !1
  %5 = insertelement <4 x float> undef, float %4, i32 0
  %6 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 1
  %7 = load float, float* %6, align 4, !tbaa !1
  %8 = insertelement <4 x float> %5, float %7, i32 1
  %9 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 2
  %10 = load float, float* %9, align 4, !tbaa !1
  %11 = insertelement <4 x float> %8, float %10, i32 2
  %12 = insertelement <4 x float> %11, float %10, i32 3
  ret <4 x float> %12
}

define <4 x float> @ConvertVectors_ByVal(<4 x float>* nocapture readonly dereferenceable(16)) {
  %2 = bitcast <4 x float>* %0 to i64*
  %3 = load i64, i64* %2, align 16
  %4 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %5 = trunc i64 %3 to i32
  %6 = bitcast i32 %5 to float
  %7 = insertelement <4 x float> undef, float %6, i32 0
  %8 = lshr i64 %3, 32
  %9 = trunc i64 %8 to i32
  %10 = bitcast i32 %9 to float
  %11 = insertelement <4 x float> %7, float %10, i32 1
  %12 = bitcast float* %4 to i64*
  %13 = load i64, i64* %12, align 8
  %14 = trunc i64 %13 to i32
  %15 = bitcast i32 %14 to float
  %16 = insertelement <4 x float> %11, float %15, i32 2
  %17 = insertelement <4 x float> %16, float %15, i32 3
  ret <4 x float> %17
}

define <4 x float> @ConvertVectors_ByCopy(<4 x float>* nocapture readonly dereferenceable(16)) {
  %2 = load <4 x float>, <4 x float>* %0, align 16, !tbaa !5
  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %3
}

Resulting in final assembly:

ConvertVectors_ByRef(float __vector(4) const&):        # @ConvertVectors_ByRef(float __vector(4) const&)
        vmovss  8(%rdi), %xmm0          # xmm0 = mem[0],zero,zero,zero
        vmovsd  (%rdi), %xmm1           # xmm1 = mem[0],zero
        vshufps $4, %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0,1],xmm0[0,0]
        retq

ConvertVectors_ByVal(float __vector(4) const&):        # @ConvertVectors_ByVal(float __vector(4) const&)
        vmovss  (%rdi), %xmm0           # xmm0 = mem[0],zero,zero,zero
        vmovss  8(%rdi), %xmm1          # xmm1 = mem[0],zero,zero,zero
        vinsertps       $16, 4(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0],xmm0[2,3]
        vshufps $4, %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0,1],xmm1[0,0]
        retq

ConvertVectors_ByCopy(float __vector(4) const&):       # @ConvertVectors_ByCopy(float __vector(4) const&)
        vpermilps       $164, (%rdi), %xmm0 # xmm0 = mem[0,1,2,2]
        retq

@rotateright

For the original example, the bitcast from vector to array might be interfering with subsequent transforms, so:
https://reviews.llvm.org/D44833
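
As an illustration of the pattern in question (a hypothetical fragment in the typed-pointer syntax of the dumps above, not actual compiler output): the two GEPs below compute the same element address, but the array-typed one, reached through the bitcast, may hide the <4 x float> type from the folds that would otherwise form a single vector load.

define float* @gep_through_bitcast_sketch(<4 x float>* %V) {
  %arr = bitcast <4 x float>* %V to [4 x float]*
  ; element 1 addressed through the array-typed alias of the vector
  %p.array = getelementptr inbounds [4 x float], [4 x float]* %arr, i64 0, i64 1
  ; the equivalent address computed directly on the vector type
  %p.vector = getelementptr inbounds <4 x float>, <4 x float>* %V, i64 0, i64 1
  ret float* %p.vector
}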


RKSimon commented Dec 7, 2018

llvmbot transferred this issue from llvm/llvm-bugzilla-archive Dec 9, 2021

RKSimon commented Sep 29, 2022

Current IR:

define <4 x float> @ConvertVectors_ByRef(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %0) {
  %2 = load <2 x float>, ptr %0, align 16
  %3 = shufflevector <2 x float> %2, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %4 = getelementptr inbounds [4 x float], ptr %0, i64 0, i64 2
  %5 = load float, ptr %4, align 8
  %6 = insertelement <4 x float> %3, float %5, i64 2
  %7 = insertelement <4 x float> %6, float %5, i64 3
  ret <4 x float> %7
}

define <4 x float> @ConvertVectors_ByVal(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %0) {
  %2 = load i64, ptr %0, align 16
  %3 = getelementptr i8, ptr %0, i64 8
  %4 = load i64, ptr %3, align 8
  %5 = trunc i64 %2 to i32
  %6 = insertelement <4 x i32> undef, i32 %5, i64 0
  %7 = lshr i64 %2, 32
  %8 = trunc i64 %7 to i32
  %9 = insertelement <4 x i32> %6, i32 %8, i64 1
  %10 = trunc i64 %4 to i32
  %11 = insertelement <4 x i32> %9, i32 %10, i64 2
  %12 = insertelement <4 x i32> %11, i32 %10, i64 3
  %13 = bitcast <4 x i32> %12 to <4 x float>
  ret <4 x float> %13
}

define <4 x float> @ConvertVectors_ByCopy(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %0) {
  %2 = load <4 x float>, ptr %0, align 16
  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %3
}

rotateright added a commit that referenced this issue Nov 10, 2022
This adapts/copies code from the existing fold that allows
widening of load scalar+insert. It can help in IR because
it removes a shuffle, and the backend can already narrow
loads if that is profitable in codegen.

We might be able to consolidate more of the logic, but
handling this basic pattern should be enough to make a small
difference on one of the motivating examples from issue #17113.
The final goal of combining loads on those patterns is not
solved though.

Differential Revision: https://reviews.llvm.org/D137341

RKSimon commented Nov 16, 2022

Current IR:

define <4 x float> @ConvertVectors_ByRef(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
entry:
  %0 = load <4 x float>, ptr %V, align 16
  %arrayidx.i14 = getelementptr inbounds [4 x float], ptr %V, i64 0, i64 2
  %1 = load float, ptr %arrayidx.i14, align 8
  %vecinit7 = insertelement <4 x float> %0, float %1, i64 2
  %vecinit10 = insertelement <4 x float> %vecinit7, float %1, i64 3
  ret <4 x float> %vecinit10
}

define <4 x float> @ConvertVectors_ByVal(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
entry:
  %V.val2536 = load i64, ptr %V, align 16
  %0 = getelementptr i8, ptr %V, i64 8
  %V.val2637 = load i64, ptr %0, align 8
  %1 = trunc i64 %V.val2536 to i32
  %2 = insertelement <4 x i32> undef, i32 %1, i64 0
  %3 = lshr i64 %V.val2536, 32
  %4 = trunc i64 %3 to i32
  %5 = insertelement <4 x i32> %2, i32 %4, i64 1
  %6 = trunc i64 %V.val2637 to i32
  %7 = insertelement <4 x i32> %5, i32 %6, i64 2
  %8 = insertelement <4 x i32> %7, i32 %6, i64 3
  %vecinit16 = bitcast <4 x i32> %8 to <4 x float>
  ret <4 x float> %vecinit16
}

define <4 x float> @_Z21ConvertVectors_ByCopy(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
entry:
  %0 = load <4 x float>, ptr %V, align 16
  %vecinit9 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %vecinit9
}


RKSimon commented Nov 16, 2022

ConvertVectors_ByRef IR has improved, and if we run it through opt -O3 again we get:

define <4 x float> @ConvertVectors_ByRef(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
  %0 = load <4 x float>, ptr %V, align 16
  %vecinit10 = shufflevector <4 x float> %0, <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %vecinit10
}

@rotateright

> ConvertVectors_ByRef IR has improved, and if we run it through opt -O3 again we get:

b57819e got us the wide load, so now GVN + instcombine can give us the ideal IR.

So now there's a tough question: should we run GVN again post-vectorization (IIUC, it's not cheap in compile-time), should there be a lighter-weight pass that specializes in load combining, or can we get away with some kind of load combining enhancement in SDAG?

@rotateright

I missed another possibility: we can add an early run of vector-combine.

We already do it (!), but it is defaulted off with a debug flag, and that invocation only tries a subset of folds.
That was added last year:
https://reviews.llvm.org/D102496
...and it had a similar motivation - it was specifically added before GVN because that made a difference for matrix ops.


RKSimon commented Nov 17, 2022

SGTM - I suppose always enabling it comes down to whether it will have a detrimental effect on compile time?

rotateright added a commit that referenced this issue Nov 17, 2022
@rotateright

Compile-time improvement for vector-combine:
87debdadaf18
...hoping that buys enough goodwill to run it a 2nd time :)

@rotateright

Enable early vector-combine proposal:
https://reviews.llvm.org/D138353


RKSimon commented Nov 21, 2022

define <4 x float> @ConvertVectors_ByVal(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
entry:
  %V.val2536 = load i64, ptr %V, align 16
  %0 = getelementptr i8, ptr %V, i64 8
  %V.val2637 = load i64, ptr %0, align 8
  %1 = trunc i64 %V.val2536 to i32
  %2 = insertelement <4 x i32> undef, i32 %1, i64 0
  %3 = lshr i64 %V.val2536, 32
  %4 = trunc i64 %3 to i32
  %5 = insertelement <4 x i32> %2, i32 %4, i64 1
  %6 = trunc i64 %V.val2637 to i32
  %7 = insertelement <4 x i32> %5, i32 %6, i64 2
  %8 = insertelement <4 x i32> %7, i32 %6, i64 3
  %vecinit16 = bitcast <4 x i32> %8 to <4 x float>
  ret <4 x float> %vecinit16
} 

I'm not sure what the best way to tackle this one is - should we consider trying harder to vectorize the truncs to something like:

define <4 x float> @ConvertVectors_ByVal_trunc(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
entry:
  %V.val2536 = load i64, ptr %V, align 16
  %0 = getelementptr i8, ptr %V, i64 8
  %V.val2637 = load i64, ptr %0, align 8
  %1 = insertelement <4 x i64> undef, i64 %V.val2536, i64 0
  %2 = insertelement <4 x i64> %1, i64 %V.val2536, i64 1
  %3 = insertelement <4 x i64> %2, i64 %V.val2637, i64 2
  %4 = insertelement <4 x i64> %3, i64 %V.val2637, i64 3
  %5 = lshr <4 x i64> %4, <i64 0, i64 32, i64 0, i64 0>
  %6 = trunc <4 x i64> %5 to <4 x i32>
  %vecinit16 = bitcast <4 x i32> %6 to <4 x float>
  ret <4 x float> %vecinit16
} 

@rotateright

The "ByVal" part of this shows a series of potential missed canonicalizations. I think we want to replace the truncs and shifts with bitcasts and shuffles.

That leads to trade-offs like this:
https://alive2.llvm.org/ce/z/dLqHXU
Ie, we can replace a trunc + insert with a bitcast alone, but then we've lost information that the high elements are undef/poison. Is that better or worse than bitcast + identity shuffle with undef elements in the mask? Note that this will also need adjustments for endian.
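
As a rough illustration of that trade-off (hypothetical functions, little-endian lane order assumed), the two shapes being compared look something like this:

; Form A: trunc + insert. Lanes 1-3 of the result are still poison.
define <4 x i32> @trunc_insert_sketch(<2 x i64> %x) {
  %lo64 = extractelement <2 x i64> %x, i64 0
  %lo32 = trunc i64 %lo64 to i32
  %v = insertelement <4 x i32> poison, i32 %lo32, i64 0
  ret <4 x i32> %v
}

; Form B: bitcast + identity shuffle. The bitcast alone would leave real (but
; unwanted) bits in lanes 1-3; the undef mask elements are what re-establish
; that those lanes are "don't care".
define <4 x i32> @bitcast_shuffle_sketch(<2 x i64> %x) {
  %bc = bitcast <2 x i64> %x to <4 x i32>
  %v = shufflevector <4 x i32> %bc, <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %v
}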

If we can do those transforms, then we can hopefully convert the load to a vector load (easier now that we have opaque ptr?). Once that happens, we're almost back to the "ByRef" version of the IR, so then it should fold similarly...


RKSimon commented Nov 21, 2022

> Ie, we can replace a trunc + insert with a bitcast alone, but then we've lost information that the high elements are undef/poison. Is that better or worse than bitcast + identity shuffle with undef elements in the mask? Note that this will also need adjustments for endian.

I suppose if we can guarantee that we are populating the entire vector then the undef/poison issue is benign.

rotateright added a commit that referenced this issue Nov 21, 2022
The option was added with https://reviews.llvm.org/D102496,
and currently the name is accurate, but I am hoping to add
a load transform that is not a scalarization. See issue #17113.
rotateright added a commit that referenced this issue Nov 21, 2022
An early run of VectorCombine was added with D102496 specifically to
deal with unnecessary vector ops produced with the C matrix extension.
This patch is proposing to try those folds in general and add a pair
of load folds to the menu.

The load transform will partly solve (see PhaseOrdering diffs) a
longstanding vectorization perf bug by removing redundant loads via GVN:
issue #17113

The main reason for not enabling the extra pass generally in the initial
patch was compile-time cost. The cost of VectorCombine was significantly
(surprisingly) improved with:
87debda
https://llvm-compile-time-tracker.com/compare.php?from=ffe05b8f57d97bc4340f791cb386c8d00e0739f2&to=87debdadaf18f8a5c7e5d563889e10731dc3554d&stat=instructions:u

...so the extra run is going to cost very little now - the total cost of
the 2 runs should be less than the 1 run before that micro-optimization:
https://llvm-compile-time-tracker.com/compare.php?from=5e8c2026d10e8e2c93c038c776853bed0e7c8fc1&to=2c4b68eab5ae969811f422714e0eba44c5f7eefb&stat=instructions:u

It may be possible to reduce the cost slightly more with a few more
earlier-exits like that, but it's probably in the noise based on timing
experiments.

Differential Revision: https://reviews.llvm.org/D138353
@rotateright

> I suppose if we can guarantee that we are populating the entire vector then the undef/poison issue is benign.

Right - although thinking about this example a bit more, we're going to need an identity shuffle either way because the vector (128-bit) is larger than the load (64-bit).

Maybe we can enhance VectorCombine to widen the load first by peeking through the trunc.

I suspect there would be cost problems with trying to increase the vector width to 256-bit as shown in your code example.
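
For reference, a hedged sketch (hypothetical function name, little-endian assumed) of where widening through the trunc would ideally land for this case: the two scalar i64 loads collapse into one 128-bit load, and the trunc/lshr chains become a bitcast plus a single shuffle.

define <4 x float> @ConvertVectors_ByVal_widened_sketch(ptr align 16 dereferenceable(16) %V) {
  ; one 16-byte load instead of two 8-byte loads (legal: dereferenceable(16))
  %wide = load <2 x i64>, ptr %V, align 16
  ; reinterpret the bits as floats and pick lanes 0, 1, 2, 2
  %f = bitcast <2 x i64> %wide to <4 x float>
  %res = shufflevector <4 x float> %f, <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %res
}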


RKSimon commented Nov 21, 2022

> I suspect there would be cost problems with trying to increase the vector width to 256-bit as shown in your code example.

Yes, I'd expect the SLP vectorizer to get that, so if it doesn't then it might not be profitable.

@rotateright

Another micro-optimization for VectorCombine:
ede6d608f4
...that should definitely make it faster to run twice now than it took to run once before.

rotateright added a commit that referenced this issue Nov 28, 2022
@rotateright

Proposal to canonicalize towards bitcast + shuffle (away from trunc + shift) - it's split into 3 pieces to reduce risk and ease review, but I consider it one fold:
https://reviews.llvm.org/D138872
https://reviews.llvm.org/D138873
https://reviews.llvm.org/D138874

There's an annoying "lack-of-CSE" artifact in the test tracking this bug. We might be able to bend instcombine to deal with that rather than go down another pipeline-altering rabbit hole. That would allow forming a vector load.
Then, we're still left with a final scalar load that GVN is not recognizing as overlapping. Either the load needs to be narrowed or GVN needs to be enhanced to see that.

rotateright added a commit that referenced this issue Dec 12, 2022
…ment

This replaces patches that tried to convert related patterns to shuffles
(D138872, D138873, D138874 - reverted/abandoned) but caused codegen
problems and were questionable as a canonicalization because an
insertelement is a simpler op than a shuffle.

This detects a larger pattern -- insert-of-insert -- and replaces with
another insert, so this hopefully does not cause any problems.

As noted by TODO items in the code and tests, this could go a lot further.
But this is enough to reduce the motivating test from issue #17113.

Example proofs:
https://alive2.llvm.org/ce/z/NnUv3a

I drafted a version of this for AggressiveInstCombine, but it seems that
would uncover yet another phase ordering gap. If we do generalize this to
handle the full range of potential patterns, that may be worth looking at
again.

Differential Revision: https://reviews.llvm.org/D139668
@rotateright

We're down to this form with 2 loads after combining the first two inserts:

define noundef <4 x float> @_Z20ConvertVectors_ByValRKDv4_f(ptr align 16 dereferenceable(16) %V) {
entry:
  %0 = load <4 x float>, ptr %V, align 16
  %1 = getelementptr i8, ptr %V, i64 8
  %V.val2637 = load i64, ptr %1, align 8, !tbaa.struct !6
  %2 = trunc i64 %V.val2637 to i32
  %3 = bitcast i32 %2 to float
  %vecinit11 = insertelement <4 x float> %0, float %3, i64 2
  %vecinit16 = insertelement <4 x float> %vecinit11, float %3, i64 3
  ret <4 x float> %vecinit16
}

And GVN + instcombine is able to reduce that to a single load + shuffle, so we have another phase ordering problem.

@rotateright

I think we've really hit the hard problem of phase ordering -- #17113 (comment) -- that we managed to sidestep on the previous example.

I did confirm that just adding a GVN run later in the pipeline is very costly in compile-time:
https://llvm-compile-time-tracker.com/compare.php?from=a274d62fecfc3f49065f3fcdcb9577637778e0bc&to=d211b22ba1f4ee87faac2fedd5627bbf9f945c01&stat=instructions:u

NewPM-O3:


Benchmark | Old | New
-- | -- | --
kimwitu++ | 49008M | 49638M (+1.29%)
sqlite3 | 49587M | 51559M (+3.98%)
consumer-typeset | 43061M | 44187M (+2.61%)
Bullet | 111829M | 113963M (+1.91%)
tramp3d-v4 | 108415M | 112719M (+3.97%)
mafft | 45181M | 46892M (+3.79%)
ClamAV | 69184M | 71392M (+3.19%)
lencod | 84530M | 88648M (+4.87%)
SPASS | 55491M | 56501M (+1.82%)
7zip | 224276M | 227877M (+1.61%)
geomean | 72783M | 74892M (+2.90%)



rotateright commented Dec 14, 2022

Adding GVN does show some small-ish improvements on other regression tests, so it really is just a question of cost:

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index f545fb4f11be..2e6607b07d46 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1178,7 +1178,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
   }
   // Enhance/cleanup vector code.
   FPM.addPass(VectorCombinePass());
-  FPM.addPass(GVNPass());
+  if (Level != OptimizationLevel::O1)
+    FPM.addPass(GVNPass());
 
   if (!IsFullLTO) {
     FPM.addPass(InstCombinePass());
diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll
index 171fe16acb10..a6bec494d9b6 100644
--- a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll
+++ b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll
@@ -51,12 +51,9 @@ define i32 @main() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[INPUT_RELOAD_ADDR13_I]] to i32*
 ; CHECK-NEXT:    [[N_VAL3_RELOAD_ADDR11_I:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[N_VAL3_RELOAD_ADDR11_I]] to i32*
-; CHECK-NEXT:    [[N_VAL3_RELOAD12_I:%.*]] = load i32, i32* [[TMP4]], align 4, !noalias !3
-; CHECK-NEXT:    [[SUM7_I:%.*]] = add i32 [[N_VAL3_RELOAD12_I]], 2
-; CHECK-NEXT:    store i32 [[SUM7_I]], i32* [[TMP4]], align 4, !noalias !3
+; CHECK-NEXT:    store i32 3, i32* [[TMP4]], align 4, !noalias !3
 ; CHECK-NEXT:    store i32 4, i32* [[TMP3]], align 4, !noalias !3
-; CHECK-NEXT:    [[SUM7_I7:%.*]] = add i32 [[N_VAL3_RELOAD12_I]], 6
-; CHECK-NEXT:    tail call void @print(i32 [[SUM7_I7]]), !noalias !6
+; CHECK-NEXT:    tail call void @print(i32 7), !noalias !6
 ; CHECK-NEXT:    tail call void @deallocate(i8* [[TMP0]]), !noalias !6
 ; CHECK-NEXT:    ret i32 0
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
index 6a4ed2cd056a..dfa95aafde1a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -1406,7 +1406,7 @@ define i32 @disabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b
 ; O3DEFAULT-NEXT:    [[TMP24:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_44]], align 4
 ; O3DEFAULT-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[TMP24]], [[TMP2]]
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP25]], ptr [[ARRAYIDX2_44]], align 4
-; O3DEFAULT-NEXT:    [[TMP26:%.*]] = load i32, ptr [[A]], align 4
+; O3DEFAULT-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
 ; O3DEFAULT-NEXT:    ret i32 [[TMP26]]
 ;
 ; Os-LABEL: @disabled(
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
index 7cbce461a492..d8fb2e4ca8c5 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -21,31 +21,27 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = shl i32 [[N_VEC]], 1
 ; CHECK-NEXT:    [[IND_END7:%.*]] = getelementptr i8, ptr [[PSRCA:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRCB:%.*]], i32 [[TMP2]]
+; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRCB:%.*]], i32 [[TMP0]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRCA]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT:    [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT:    [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PSRCB]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRCA]], i32 [[TMP1]]
+; CHECK-NEXT:    [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[TMP1]]
+; CHECK-NEXT:    [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PSRCB]], i32 [[TMP1]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
 ; CHECK-NEXT:    [[WIDE_LOAD15:%.*]] = load <8 x i16>, ptr [[NEXT_GEP14]], align 2
-; CHECK-NEXT:    [[TMP7:%.*]] = sext <8 x i16> [[WIDE_LOAD15]] to <8 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = mul nsw <8 x i32> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = ashr <8 x i32> [[TMP8]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT:    [[TMP10:%.*]] = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP9]], <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc <8 x i32> [[TMP10]] to <8 x i16>
-; CHECK-NEXT:    store <8 x i16> [[TMP11]], ptr [[NEXT_GEP13]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD15]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = ashr <8 x i32> [[TMP4]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP5]], <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc <8 x i32> [[TMP6]] to <8 x i16>
+; CHECK-NEXT:    store <8 x i16> [[TMP7]], ptr [[NEXT_GEP13]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER16]]
@@ -61,15 +57,15 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    [[PDST_ADDR_04:%.*]] = phi ptr [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_04_PH]], [[WHILE_BODY_PREHEADER16]] ]
 ; CHECK-NEXT:    [[PSRCB_ADDR_03:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_03_PH]], [[WHILE_BODY_PREHEADER16]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PSRCA_ADDR_05]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[PSRCA_ADDR_05]], align 2
-; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP13]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[PSRCA_ADDR_05]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP9]] to i32
 ; CHECK-NEXT:    [[INCDEC_PTR1]] = getelementptr inbounds i16, ptr [[PSRCB_ADDR_03]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[PSRCB_ADDR_03]], align 2
-; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[TMP14]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[PSRCB_ADDR_03]], align 2
+; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[TMP10]] to i32
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
 ; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[MUL]], 15
-; CHECK-NEXT:    [[TMP15:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767)
-; CHECK-NEXT:    [[CONV3:%.*]] = trunc i32 [[TMP15]] to i16
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767)
+; CHECK-NEXT:    [[CONV3:%.*]] = trunc i32 [[TMP11]] to i16
 ; CHECK-NEXT:    [[INCDEC_PTR4]] = getelementptr inbounds i16, ptr [[PDST_ADDR_04]], i32 1
 ; CHECK-NEXT:    store i16 [[CONV3]], ptr [[PDST_ADDR_04]], align 2
 ; CHECK-NEXT:    [[DEC]] = add i32 [[BLKCNT_06]], -1
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
index 11b6f72793db..d934c080965e 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
@@ -29,12 +29,6 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT9]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT11]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT13]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -16
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
@@ -44,58 +38,58 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK:       vector.ph.new:
 ; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP3]], -2
 ; CHECK-NEXT:    [[TMP5:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT10]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT12]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT14]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP10:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT12]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT14]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP13]], align 8, !tbaa [[TBAA3:![0-9]+]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 4
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP15]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 8
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x double>, ptr [[TMP17]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 12
-; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x double>, ptr [[TMP19]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP21:%.*]] = fmul fast <4 x double> [[WIDE_LOAD]], [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = fmul fast <4 x double> [[WIDE_LOAD6]], [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7]], [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8]], [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT:    store <4 x double> [[TMP21]], ptr [[TMP25]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 4
-; CHECK-NEXT:    store <4 x double> [[TMP22]], ptr [[TMP27]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 8
-; CHECK-NEXT:    store <4 x double> [[TMP23]], ptr [[TMP29]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 12
-; CHECK-NEXT:    store <4 x double> [[TMP24]], ptr [[TMP31]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 4
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP14]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 8
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x double>, ptr [[TMP15]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 12
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x double>, ptr [[TMP16]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul fast <4 x double> [[WIDE_LOAD]], [[TMP5]]
+; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast <4 x double> [[WIDE_LOAD6]], [[TMP6]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7]], [[TMP7]]
+; CHECK-NEXT:    [[TMP20:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8]], [[TMP8]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
+; CHECK-NEXT:    store <4 x double> [[TMP17]], ptr [[TMP21]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i64 4
+; CHECK-NEXT:    store <4 x double> [[TMP18]], ptr [[TMP22]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i64 8
+; CHECK-NEXT:    store <4 x double> [[TMP19]], ptr [[TMP23]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i64 12
+; CHECK-NEXT:    store <4 x double> [[TMP20]], ptr [[TMP24]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX_NEXT]]
-; CHECK-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x double>, ptr [[TMP33]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 4
-; CHECK-NEXT:    [[WIDE_LOAD6_1:%.*]] = load <4 x double>, ptr [[TMP35]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 8
-; CHECK-NEXT:    [[WIDE_LOAD7_1:%.*]] = load <4 x double>, ptr [[TMP37]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 12
-; CHECK-NEXT:    [[WIDE_LOAD8_1:%.*]] = load <4 x double>, ptr [[TMP39]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP41:%.*]] = fmul fast <4 x double> [[WIDE_LOAD_1]], [[TMP9]]
-; CHECK-NEXT:    [[TMP42:%.*]] = fmul fast <4 x double> [[WIDE_LOAD6_1]], [[TMP10]]
-; CHECK-NEXT:    [[TMP43:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7_1]], [[TMP11]]
-; CHECK-NEXT:    [[TMP44:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8_1]], [[TMP12]]
-; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX_NEXT]]
-; CHECK-NEXT:    store <4 x double> [[TMP41]], ptr [[TMP45]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 4
-; CHECK-NEXT:    store <4 x double> [[TMP42]], ptr [[TMP47]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 8
-; CHECK-NEXT:    store <4 x double> [[TMP43]], ptr [[TMP49]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 12
-; CHECK-NEXT:    store <4 x double> [[TMP44]], ptr [[TMP51]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX_NEXT]]
+; CHECK-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x double>, ptr [[TMP25]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 4
+; CHECK-NEXT:    [[WIDE_LOAD6_1:%.*]] = load <4 x double>, ptr [[TMP26]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 8
+; CHECK-NEXT:    [[WIDE_LOAD7_1:%.*]] = load <4 x double>, ptr [[TMP27]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 12
+; CHECK-NEXT:    [[WIDE_LOAD8_1:%.*]] = load <4 x double>, ptr [[TMP28]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP29:%.*]] = fmul fast <4 x double> [[WIDE_LOAD_1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast <4 x double> [[WIDE_LOAD6_1]], [[TMP10]]
+; CHECK-NEXT:    [[TMP31:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7_1]], [[TMP11]]
+; CHECK-NEXT:    [[TMP32:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8_1]], [[TMP12]]
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX_NEXT]]
+; CHECK-NEXT:    store <4 x double> [[TMP29]], ptr [[TMP33]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 4
+; CHECK-NEXT:    store <4 x double> [[TMP30]], ptr [[TMP34]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 8
+; CHECK-NEXT:    store <4 x double> [[TMP31]], ptr [[TMP35]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 12
+; CHECK-NEXT:    store <4 x double> [[TMP32]], ptr [[TMP36]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDEX_NEXT_1]] = add nuw i64 [[INDEX]], 32
 ; CHECK-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
 ; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
@@ -105,115 +99,115 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
 ; CHECK-NEXT:    br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]]
 ; CHECK:       vector.body.epil:
-; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX_UNR]]
-; CHECK-NEXT:    [[WIDE_LOAD_EPIL:%.*]] = load <4 x double>, ptr [[TMP53]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds double, ptr [[TMP53]], i64 4
-; CHECK-NEXT:    [[WIDE_LOAD6_EPIL:%.*]] = load <4 x double>, ptr [[TMP55]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds double, ptr [[TMP53]], i64 8
-; CHECK-NEXT:    [[WIDE_LOAD7_EPIL:%.*]] = load <4 x double>, ptr [[TMP57]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds double, ptr [[TMP53]], i64 12
-; CHECK-NEXT:    [[WIDE_LOAD8_EPIL:%.*]] = load <4 x double>, ptr [[TMP59]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP61:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD_EPIL]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP62:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD6_EPIL]], [[BROADCAST_SPLAT10]]
-; CHECK-NEXT:    [[TMP63:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD7_EPIL]], [[BROADCAST_SPLAT12]]
-; CHECK-NEXT:    [[TMP64:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD8_EPIL]], [[BROADCAST_SPLAT14]]
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX_UNR]]
-; CHECK-NEXT:    store <4 x double> [[TMP61]], ptr [[TMP65]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr inbounds double, ptr [[TMP65]], i64 4
-; CHECK-NEXT:    store <4 x double> [[TMP62]], ptr [[TMP67]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds double, ptr [[TMP65]], i64 8
-; CHECK-NEXT:    store <4 x double> [[TMP63]], ptr [[TMP69]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds double, ptr [[TMP65]], i64 12
-; CHECK-NEXT:    store <4 x double> [[TMP64]], ptr [[TMP71]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX_UNR]]
+; CHECK-NEXT:    [[WIDE_LOAD_EPIL:%.*]] = load <4 x double>, ptr [[TMP37]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, ptr [[TMP37]], i64 4
+; CHECK-NEXT:    [[WIDE_LOAD6_EPIL:%.*]] = load <4 x double>, ptr [[TMP38]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds double, ptr [[TMP37]], i64 8
+; CHECK-NEXT:    [[WIDE_LOAD7_EPIL:%.*]] = load <4 x double>, ptr [[TMP39]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds double, ptr [[TMP37]], i64 12
+; CHECK-NEXT:    [[WIDE_LOAD8_EPIL:%.*]] = load <4 x double>, ptr [[TMP40]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP41:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD_EPIL]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP42:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD6_EPIL]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP43:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD7_EPIL]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP44:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD8_EPIL]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX_UNR]]
+; CHECK-NEXT:    store <4 x double> [[TMP41]], ptr [[TMP45]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 4
+; CHECK-NEXT:    store <4 x double> [[TMP42]], ptr [[TMP46]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 8
+; CHECK-NEXT:    store <4 x double> [[TMP43]], ptr [[TMP47]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 12
+; CHECK-NEXT:    store <4 x double> [[TMP44]], ptr [[TMP48]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    br label [[MIDDLE_BLOCK]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER15]]
 ; CHECK:       for.body.preheader15:
 ; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[TMP73:%.*]] = xor i64 [[INDVARS_IV_PH]], -1
-; CHECK-NEXT:    [[TMP74:%.*]] = add nsw i64 [[TMP73]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[TMP49:%.*]] = xor i64 [[INDVARS_IV_PH]], -1
+; CHECK-NEXT:    [[TMP50:%.*]] = add nsw i64 [[TMP49]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    [[XTRAITER16:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7
 ; CHECK-NEXT:    [[LCMP_MOD17_NOT:%.*]] = icmp eq i64 [[XTRAITER16]], 0
 ; CHECK-NEXT:    br i1 [[LCMP_MOD17_NOT]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL_PREHEADER:%.*]]
 ; CHECK:       for.body.prol.preheader:
-; CHECK-NEXT:    [[TMP75:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP51:%.*]] = fdiv fast double 1.000000e+00, [[A]]
 ; CHECK-NEXT:    br label [[FOR_BODY_PROL:%.*]]
 ; CHECK:       for.body.prol:
 ; CHECK-NEXT:    [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_BODY_PROL]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PROL_PREHEADER]] ]
 ; CHECK-NEXT:    [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_NEXT:%.*]], [[FOR_BODY_PROL]] ], [ 0, [[FOR_BODY_PROL_PREHEADER]] ]
 ; CHECK-NEXT:    [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_PROL]]
 ; CHECK-NEXT:    [[T0_PROL:%.*]] = load double, ptr [[ARRAYIDX_PROL]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP76:%.*]] = fmul fast double [[T0_PROL]], [[TMP75]]
+; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast double [[T0_PROL]], [[TMP51]]
 ; CHECK-NEXT:    [[ARRAYIDX2_PROL:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_PROL]]
-; CHECK-NEXT:    store double [[TMP76]], ptr [[ARRAYIDX2_PROL]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP52]], ptr [[ARRAYIDX2_PROL]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 1
 ; CHECK-NEXT:    [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
 ; CHECK-NEXT:    [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER16]]
 ; CHECK-NEXT:    br i1 [[PROL_ITER_CMP_NOT]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       for.body.prol.loopexit:
 ; CHECK-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER15]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ]
-; CHECK-NEXT:    [[TMP77:%.*]] = icmp ult i64 [[TMP74]], 7
-; CHECK-NEXT:    br i1 [[TMP77]], label [[FOR_END]], label [[FOR_BODY_PREHEADER15_NEW:%.*]]
+; CHECK-NEXT:    [[TMP53:%.*]] = icmp ult i64 [[TMP50]], 7
+; CHECK-NEXT:    br i1 [[TMP53]], label [[FOR_END]], label [[FOR_BODY_PREHEADER15_NEW:%.*]]
 ; CHECK:       for.body.preheader15.new:
-; CHECK-NEXT:    [[TMP78:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP79:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP80:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP81:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP82:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP83:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP84:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP85:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP54:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP55:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP56:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP57:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP58:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP59:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP60:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP61:%.*]] = fdiv fast double 1.000000e+00, [[A]]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER15_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[T0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP86:%.*]] = fmul fast double [[T0]], [[TMP78]]
+; CHECK-NEXT:    [[TMP62:%.*]] = fmul fast double [[T0]], [[TMP54]]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store double [[TMP86]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP62]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    [[T0_1:%.*]] = load double, ptr [[ARRAYIDX_1]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP87:%.*]] = fmul fast double [[T0_1]], [[TMP79]]
+; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast double [[T0_1]], [[TMP55]]
 ; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    store double [[TMP87]], ptr [[ARRAYIDX2_1]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP63]], ptr [[ARRAYIDX2_1]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_1]]
 ; CHECK-NEXT:    [[T0_2:%.*]] = load double, ptr [[ARRAYIDX_2]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP88:%.*]] = fmul fast double [[T0_2]], [[TMP80]]
+; CHECK-NEXT:    [[TMP64:%.*]] = fmul fast double [[T0_2]], [[TMP56]]
 ; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_1]]
-; CHECK-NEXT:    store double [[TMP88]], ptr [[ARRAYIDX2_2]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP64]], ptr [[ARRAYIDX2_2]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_2]]
 ; CHECK-NEXT:    [[T0_3:%.*]] = load double, ptr [[ARRAYIDX_3]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP89:%.*]] = fmul fast double [[T0_3]], [[TMP81]]
+; CHECK-NEXT:    [[TMP65:%.*]] = fmul fast double [[T0_3]], [[TMP57]]
 ; CHECK-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_2]]
-; CHECK-NEXT:    store double [[TMP89]], ptr [[ARRAYIDX2_3]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP65]], ptr [[ARRAYIDX2_3]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
 ; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_3]]
 ; CHECK-NEXT:    [[T0_4:%.*]] = load double, ptr [[ARRAYIDX_4]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP90:%.*]] = fmul fast double [[T0_4]], [[TMP82]]
+; CHECK-NEXT:    [[TMP66:%.*]] = fmul fast double [[T0_4]], [[TMP58]]
 ; CHECK-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_3]]
-; CHECK-NEXT:    store double [[TMP90]], ptr [[ARRAYIDX2_4]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP66]], ptr [[ARRAYIDX2_4]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5
 ; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_4]]
 ; CHECK-NEXT:    [[T0_5:%.*]] = load double, ptr [[ARRAYIDX_5]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP91:%.*]] = fmul fast double [[T0_5]], [[TMP83]]
+; CHECK-NEXT:    [[TMP67:%.*]] = fmul fast double [[T0_5]], [[TMP59]]
 ; CHECK-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_4]]
-; CHECK-NEXT:    store double [[TMP91]], ptr [[ARRAYIDX2_5]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP67]], ptr [[ARRAYIDX2_5]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_5]]
 ; CHECK-NEXT:    [[T0_6:%.*]] = load double, ptr [[ARRAYIDX_6]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP92:%.*]] = fmul fast double [[T0_6]], [[TMP84]]
+; CHECK-NEXT:    [[TMP68:%.*]] = fmul fast double [[T0_6]], [[TMP60]]
 ; CHECK-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_5]]
-; CHECK-NEXT:    store double [[TMP92]], ptr [[ARRAYIDX2_6]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP68]], ptr [[ARRAYIDX2_6]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_6]]
 ; CHECK-NEXT:    [[T0_7:%.*]] = load double, ptr [[ARRAYIDX_7]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP93:%.*]] = fmul fast double [[T0_7]], [[TMP85]]
+; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast double [[T0_7]], [[TMP61]]
 ; CHECK-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_6]]
-; CHECK-NEXT:    store double [[TMP93]], ptr [[ARRAYIDX2_7]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP69]], ptr [[ARRAYIDX2_7]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
 ; CHECK-NEXT:    [[EXITCOND_NOT_7:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_7]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT_7]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 77cbc70ff369..334405b12e65 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -47,24 +47,18 @@ define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull a
 define noundef <4 x float> @ConvertVectors_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %V) #0 {
 ; SSE-LABEL: @ConvertVectors_ByVal(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
-; SSE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[V]], i64 8
-; SSE-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
-; SSE-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
-; SSE-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; SSE-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
-; SSE-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
+; SSE-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[V:%.*]], align 16
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x float>
+; SSE-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x float>
+; SSE-NEXT:    [[VECINIT16:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[BC]], <4 x i32> <i32 0, i32 1, i32 6, i32 6>
 ; SSE-NEXT:    ret <4 x float> [[VECINIT16]]
 ;
 ; AVX-LABEL: @ConvertVectors_ByVal(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
-; AVX-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[V]], i64 8
-; AVX-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
-; AVX-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
-; AVX-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; AVX-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
-; AVX-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
+; AVX-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[V:%.*]], align 16
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x float>
+; AVX-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x float>
+; AVX-NEXT:    [[VECINIT16:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[BC]], <4 x i32> <i32 0, i32 1, i32 6, i32 6>
 ; AVX-NEXT:    ret <4 x float> [[VECINIT16]]
 ;
 entry:


RKSimon commented Dec 16, 2022

Alternatively - is there anything we can add/move before the GVN pass to improve its chances?

@rotateright

I've been searching for something that could allow us to get the necessary transforms without adding a pass, and I can't see anything.

The existing GVN + InstCombine makes the pattern that allows VectorCombine to create a wide load, and only after that load is created, another round of GVN + InstCombine allows combining the loads.

Also note that the GVN transform has been flagged as not poison-safe, so there's a chance that we'll lose the optimization on the first example. :(


RKSimon commented May 12, 2023

CC @nikic - this was one of the tickets I mentioned at EuroLLVM regarding possible improvements to FunctionAttr/Attributor/GVN or adding an additional GVN run
