Failure to simplify SIMD vector conversion. #17113

Open
silvasean opened this issue Jul 29, 2013 · 24 comments
Labels: bugzilla (Issues migrated from bugzilla), llvm:instcombine



silvasean commented Jul 29, 2013

Bugzilla Link: 16739
Version: trunk
OS: All
CC: @filcab, @LebedevRI, @RKSimon, @rotateright
Fixed by commit(s): r366441

Extended Description

This code was reduced from a function that converted between SIMD vector classes used by two different libraries; the source and destination vectors have a <4 x float> underlying storage, but notionally hold only {x, y, z} (and the destination duplicates z into the last lane; the source leaves it undefined I think).

typedef float __m128 __attribute__((__vector_size__(16)));
union ElementWiseAccess {
  ElementWiseAccess(__m128 v) : ReprM128(v) {}
  __m128 ReprM128;
  float ReprFloatArray[4];
  float getAt(int i) const { return ReprFloatArray[i]; }
};
// Making this return `const ElementWiseAccess` instead of `const ElementWiseAccess &`
// still results in a failure to optimize, but in a different way.
static const ElementWiseAccess &castToElementWiseAccess(const __m128 &t) {
  return reinterpret_cast<const ElementWiseAccess &>(t);
}
__m128 ConvertVectors(const __m128 &V) {
  // Replacing `castToElementWiseAccess` with directly calling
  // `ElementWiseAccess` makes the issue go away.
  return (__m128) { castToElementWiseAccess(V).getAt(0), //
                    castToElementWiseAccess(V).getAt(1), //
                    castToElementWiseAccess(V).getAt(2), //
                    castToElementWiseAccess(V).getAt(2) };
}

clang -O3 produces:

define <4 x float> @_Z14ConvertVectorsRKDv4_f(<4 x float>* nocapture readonly %V) #0 {
  %1 = bitcast <4 x float>* %V to [4 x float]*
  %2 = getelementptr inbounds <4 x float>* %V, i64 0, i64 0
  %3 = load float* %2, align 4, !tbaa !0
  %4 = insertelement <4 x float> undef, float %3, i32 0
  %5 = getelementptr inbounds [4 x float]* %1, i64 0, i64 1
  %6 = load float* %5, align 4, !tbaa !0
  %7 = insertelement <4 x float> %4, float %6, i32 1
  %8 = getelementptr inbounds [4 x float]* %1, i64 0, i64 2
  %9 = load float* %8, align 4, !tbaa !0
  %10 = insertelement <4 x float> %7, float %9, i32 2
  %11 = insertelement <4 x float> %10, float %9, i32 3
  ret <4 x float> %11
}

It appears that something is interfering with folding the load/insertelement sequence into a vector load + shufflevector.

Making the modification indicated in the comments (having castToElementWiseAccess return by value instead of by reference) results in:

define <4 x float> @_Z14ConvertVectorsRKDv4_f(<4 x float>* nocapture readonly %V) #0 {
  %1 = bitcast <4 x float>* %V to i8*
  %2 = bitcast <4 x float>* %V to double*
  %3 = load double* %2, align 16
  %4 = getelementptr inbounds i8* %1, i64 8
  %5 = bitcast i8* %4 to double*
  %6 = bitcast double %3 to i64
  %trunc = trunc i64 %6 to i32
  %bitcast = bitcast i32 %trunc to float
  %7 = insertelement <4 x float> undef, float %bitcast, i32 0
  %8 = lshr i64 %6, 32
  %9 = trunc i64 %8 to i32
  %10 = bitcast i32 %9 to float
  %11 = insertelement <4 x float> %7, float %10, i32 1
  %12 = load double* %5, align 8
  %13 = bitcast double %12 to i64
  %trunc6 = trunc i64 %13 to i32
  %bitcast7 = bitcast i32 %trunc6 to float
  %14 = insertelement <4 x float> %11, float %bitcast7, i32 2
  %15 = insertelement <4 x float> %14, float %bitcast7, i32 3
  ret <4 x float> %15
}

The issue in this case seems to be that clang lowers castToElementWiseAccess as returning {double, double}, which then prevents a <4 x float> load from being generated.
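
To make that concrete, the coercion looks roughly like the following sketch (a hypothetical function written in current opaque-pointer syntax, not the verbatim clang output): the x86-64 ABI returns the 16-byte union in two SSE registers, so the by-value helper is modeled as returning a { double, double } aggregate, and the caller then has to crack those doubles back apart with trunc/lshr/bitcast instead of keeping a <4 x float>.

define { double, double } @castToElementWiseAccess_ByVal_sketch(ptr %t) {
  ; load each 8-byte half of the union as a double (the coerced return slots)
  %lo = load double, ptr %t, align 16
  %hi.addr = getelementptr inbounds i8, ptr %t, i64 8
  %hi = load double, ptr %hi.addr, align 8
  ; build the { double, double } return aggregate
  %r0 = insertvalue { double, double } poison, double %lo, 0
  %r1 = insertvalue { double, double } %r0, double %hi, 1
  ret { double, double } %r1
}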

Making the modification of replacing the call to castToElementWiseAccess with directly invoking the constructor (e.g. ElementWiseAccess(V).getAt(<<<n>>>)) results in the following code, which is the desired codegen for the initial test case:

define <4 x float> @_Z14ConvertVectorsRKDv4_f(<4 x float>* nocapture readonly %V) #0 {
  %1 = load <4 x float>* %V, align 16, !tbaa !0
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %2
}

RKSimon commented Mar 30, 2017

Trunk still has issues with this, and it reminds me of [Bug #21780]:

typedef float __m128 __attribute__((__vector_size__(16)));
union ElementWiseAccess {
  ElementWiseAccess(__m128 v) : ReprM128(v) {}
  __m128 ReprM128;
  float ReprFloatArray[4];
  float getAt(int i) const { return ReprFloatArray[i]; }
};

static const ElementWiseAccess &castToElementWiseAccess_ByRef(const __m128 &t) {
  return reinterpret_cast<const ElementWiseAccess &>(t);
}
static const ElementWiseAccess castToElementWiseAccess_ByVal(const __m128 &t) {
  return reinterpret_cast<const ElementWiseAccess &>(t);
}

__m128 ConvertVectors_ByRef(const __m128 &V) {
  return (__m128) { castToElementWiseAccess_ByRef(V).getAt(0), //
                    castToElementWiseAccess_ByRef(V).getAt(1), //
                    castToElementWiseAccess_ByRef(V).getAt(2), //
                    castToElementWiseAccess_ByRef(V).getAt(2) };
}
__m128 ConvertVectors_ByVal(const __m128 &V) {
  return (__m128) { castToElementWiseAccess_ByVal(V).getAt(0), //
                    castToElementWiseAccess_ByVal(V).getAt(1), //
                    castToElementWiseAccess_ByVal(V).getAt(2), //
                    castToElementWiseAccess_ByVal(V).getAt(2) };
}
__m128 ConvertVectors_ByCopy(const __m128 &V) {
  return (__m128) { ElementWiseAccess(V).getAt(0), //
                    ElementWiseAccess(V).getAt(1), //
                    ElementWiseAccess(V).getAt(2), //
                    ElementWiseAccess(V).getAt(2) };
}

Looking at the IR, it knows that the entire vector is dereferenceable, but it still makes a mess of combining the inserted loads:

define <4 x float> @ConvertVectors_ByRef(<4 x float>* nocapture readonly dereferenceable(16)) {
  %2 = bitcast <4 x float>* %0 to [4 x float]*
  %3 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
  %4 = load float, float* %3, align 4, !tbaa !1
  %5 = insertelement <4 x float> undef, float %4, i32 0
  %6 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 1
  %7 = load float, float* %6, align 4, !tbaa !1
  %8 = insertelement <4 x float> %5, float %7, i32 1
  %9 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 2
  %10 = load float, float* %9, align 4, !tbaa !1
  %11 = insertelement <4 x float> %8, float %10, i32 2
  %12 = insertelement <4 x float> %11, float %10, i32 3
  ret <4 x float> %12
}

define <4 x float> @ConvertVectors_ByVal(<4 x float>* nocapture readonly dereferenceable(16)) {
  %2 = bitcast <4 x float>* %0 to i64*
  %3 = load i64, i64* %2, align 16
  %4 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %5 = trunc i64 %3 to i32
  %6 = bitcast i32 %5 to float
  %7 = insertelement <4 x float> undef, float %6, i32 0
  %8 = lshr i64 %3, 32
  %9 = trunc i64 %8 to i32
  %10 = bitcast i32 %9 to float
  %11 = insertelement <4 x float> %7, float %10, i32 1
  %12 = bitcast float* %4 to i64*
  %13 = load i64, i64* %12, align 8
  %14 = trunc i64 %13 to i32
  %15 = bitcast i32 %14 to float
  %16 = insertelement <4 x float> %11, float %15, i32 2
  %17 = insertelement <4 x float> %16, float %15, i32 3
  ret <4 x float> %17
}

define <4 x float> @ConvertVectors_ByCopy(<4 x float>* nocapture readonly dereferenceable(16)) {
  %2 = load <4 x float>, <4 x float>* %0, align 16, !tbaa !5
  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %3
}

Resulting in final assembly:

ConvertVectors_ByRef(float __vector(4) const&):        # @ConvertVectors_ByRef(float __vector(4) const&)
        vmovss  8(%rdi), %xmm0          # xmm0 = mem[0],zero,zero,zero
        vmovsd  (%rdi), %xmm1           # xmm1 = mem[0],zero
        vshufps $4, %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0,1],xmm0[0,0]
        retq

ConvertVectors_ByVal(float __vector(4) const&):        # @ConvertVectors_ByVal(float __vector(4) const&)
        vmovss  (%rdi), %xmm0           # xmm0 = mem[0],zero,zero,zero
        vmovss  8(%rdi), %xmm1          # xmm1 = mem[0],zero,zero,zero
        vinsertps       $16, 4(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0],xmm0[2,3]
        vshufps $4, %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0,1],xmm1[0,0]
        retq

ConvertVectors_ByCopy(float __vector(4) const&):       # @ConvertVectors_ByCopy(float __vector(4) const&)
        vpermilps       $164, (%rdi), %xmm0 # xmm0 = mem[0,1,2,2]
        retq

@rotateright

For the original example, the bitcast from vector to array might be interfering with subsequent transforms, so:
https://reviews.llvm.org/D44833
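
As an illustration of the pattern in question (a hypothetical fragment in the typed-pointer syntax of the dumps above, not actual compiler output): the two GEPs below compute the same element address, but the array-typed one, reached through the bitcast, may hide the <4 x float> type from the folds that would otherwise form a single vector load.

define float* @gep_through_bitcast_sketch(<4 x float>* %V) {
  %arr = bitcast <4 x float>* %V to [4 x float]*
  ; element 1 addressed through the array-typed alias of the vector
  %p.array = getelementptr inbounds [4 x float], [4 x float]* %arr, i64 0, i64 1
  ; the equivalent address computed directly on the vector type
  %p.vector = getelementptr inbounds <4 x float>, <4 x float>* %V, i64 0, i64 1
  ret float* %p.vector
}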


RKSimon commented Dec 7, 2018

llvmbot transferred this issue from llvm/llvm-bugzilla-archive Dec 9, 2021

RKSimon commented Sep 29, 2022

Current IR:

define <4 x float> @ConvertVectors_ByRef(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %0) {
  %2 = load <2 x float>, ptr %0, align 16
  %3 = shufflevector <2 x float> %2, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %4 = getelementptr inbounds [4 x float], ptr %0, i64 0, i64 2
  %5 = load float, ptr %4, align 8
  %6 = insertelement <4 x float> %3, float %5, i64 2
  %7 = insertelement <4 x float> %6, float %5, i64 3
  ret <4 x float> %7
}

define <4 x float> @ConvertVectors_ByVal(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %0) {
  %2 = load i64, ptr %0, align 16
  %3 = getelementptr i8, ptr %0, i64 8
  %4 = load i64, ptr %3, align 8
  %5 = trunc i64 %2 to i32
  %6 = insertelement <4 x i32> undef, i32 %5, i64 0
  %7 = lshr i64 %2, 32
  %8 = trunc i64 %7 to i32
  %9 = insertelement <4 x i32> %6, i32 %8, i64 1
  %10 = trunc i64 %4 to i32
  %11 = insertelement <4 x i32> %9, i32 %10, i64 2
  %12 = insertelement <4 x i32> %11, i32 %10, i64 3
  %13 = bitcast <4 x i32> %12 to <4 x float>
  ret <4 x float> %13
}

define <4 x float> @ConvertVectors_ByCopy(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %0) {
  %2 = load <4 x float>, ptr %0, align 16
  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %3
}

rotateright added a commit that referenced this issue Nov 10, 2022
This adapts/copies code from the existing fold that allows
widening of load scalar+insert. It can help in IR because
it removes a shuffle, and the backend can already narrow
loads if that is profitable in codegen.

We might be able to consolidate more of the logic, but
handling this basic pattern should be enough to make a small
difference on one of the motivating examples from issue #17113.
The final goal of combining loads on those patterns is not
solved though.

Differential Revision: https://reviews.llvm.org/D137341

RKSimon commented Nov 16, 2022

Current IR:

define <4 x float> @ConvertVectors_ByRef(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
entry:
  %0 = load <4 x float>, ptr %V, align 16
  %arrayidx.i14 = getelementptr inbounds [4 x float], ptr %V, i64 0, i64 2
  %1 = load float, ptr %arrayidx.i14, align 8
  %vecinit7 = insertelement <4 x float> %0, float %1, i64 2
  %vecinit10 = insertelement <4 x float> %vecinit7, float %1, i64 3
  ret <4 x float> %vecinit10
}

define <4 x float> @ConvertVectors_ByVal(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
entry:
  %V.val2536 = load i64, ptr %V, align 16
  %0 = getelementptr i8, ptr %V, i64 8
  %V.val2637 = load i64, ptr %0, align 8
  %1 = trunc i64 %V.val2536 to i32
  %2 = insertelement <4 x i32> undef, i32 %1, i64 0
  %3 = lshr i64 %V.val2536, 32
  %4 = trunc i64 %3 to i32
  %5 = insertelement <4 x i32> %2, i32 %4, i64 1
  %6 = trunc i64 %V.val2637 to i32
  %7 = insertelement <4 x i32> %5, i32 %6, i64 2
  %8 = insertelement <4 x i32> %7, i32 %6, i64 3
  %vecinit16 = bitcast <4 x i32> %8 to <4 x float>
  ret <4 x float> %vecinit16
}

define <4 x float> @_Z21ConvertVectors_ByCopy(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
entry:
  %0 = load <4 x float>, ptr %V, align 16
  %vecinit9 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %vecinit9
}


RKSimon commented Nov 16, 2022

ConvertVectors_ByRef IR has improved, and if we run it through opt -O3 again we get:

define <4 x float> @ConvertVectors_ByRef(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
  %0 = load <4 x float>, ptr %V, align 16
  %vecinit10 = shufflevector <4 x float> %0, <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %vecinit10
}

@rotateright

> ConvertVectors_ByRef IR has improved, and if we run it through opt -O3 again we get:

b57819e got us the wide load, so now GVN + instcombine can give us the ideal IR.

So now there's a tough question: should we run GVN again post-vectorization (IIUC, it's not cheap in compile-time), should there be a lighter-weight pass that specializes in load combining, or can we get away with some kind of load combining enhancement in SDAG?

@rotateright

I missed another possibility: we can add an early run of vector-combine.

We already do it (!), but it is defaulted off with a debug flag, and that invocation only tries a subset of folds.
That was added last year:
https://reviews.llvm.org/D102496
...and it had a similar motivation - it was specifically added before GVN because that made a difference for matrix ops.


RKSimon commented Nov 17, 2022

SGTM - I suppose always enabling it comes down to whether it will have a detrimental effect on compile time?

rotateright added a commit that referenced this issue Nov 17, 2022
@rotateright

Compile-time improvement for vector-combine:
87debdadaf18
...hoping that buys enough goodwill to run it a 2nd time :)

@rotateright

Enable early vector-combine proposal:
https://reviews.llvm.org/D138353


RKSimon commented Nov 21, 2022

define <4 x float> @ConvertVectors_ByVal(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
entry:
  %V.val2536 = load i64, ptr %V, align 16
  %0 = getelementptr i8, ptr %V, i64 8
  %V.val2637 = load i64, ptr %0, align 8
  %1 = trunc i64 %V.val2536 to i32
  %2 = insertelement <4 x i32> undef, i32 %1, i64 0
  %3 = lshr i64 %V.val2536, 32
  %4 = trunc i64 %3 to i32
  %5 = insertelement <4 x i32> %2, i32 %4, i64 1
  %6 = trunc i64 %V.val2637 to i32
  %7 = insertelement <4 x i32> %5, i32 %6, i64 2
  %8 = insertelement <4 x i32> %7, i32 %6, i64 3
  %vecinit16 = bitcast <4 x i32> %8 to <4 x float>
  ret <4 x float> %vecinit16
} 

I'm not sure what the best way to tackle this one is - should we consider trying harder to vectorize the truncs to something like:

define <4 x float> @ConvertVectors_ByVal_trunc(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
entry:
  %V.val2536 = load i64, ptr %V, align 16
  %0 = getelementptr i8, ptr %V, i64 8
  %V.val2637 = load i64, ptr %0, align 8
  %1 = insertelement <4 x i64> undef, i64 %V.val2536, i64 0
  %2 = insertelement <4 x i64> %1, i64 %V.val2536, i64 1
  %3 = insertelement <4 x i64> %2, i64 %V.val2637, i64 2
  %4 = insertelement <4 x i64> %3, i64 %V.val2637, i64 3
  %5 = lshr <4 x i64> %4, <i64 0, i64 32, i64 0, i64 0>
  %6 = trunc <4 x i64> %5 to <4 x i32>
  %vecinit16 = bitcast <4 x i32> %6 to <4 x float>
  ret <4 x float> %vecinit16
} 

@rotateright

The "ByVal" part of this shows a series of potential missed canonicalizations. I think we want to replace the truncs and shifts with bitcasts and shuffles.

That leads to trade-offs like this:
https://alive2.llvm.org/ce/z/dLqHXU
Ie, we can replace a trunc + insert with a bitcast alone, but then we've lost information that the high elements are undef/poison. Is that better or worse than bitcast + identity shuffle with undef elements in the mask? Note that this will also need adjustments for endian.
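
As a rough illustration of that trade-off (hypothetical functions, little-endian lane order assumed), the two shapes being compared look something like this:

; Form A: trunc + insert. Lanes 1-3 of the result are still poison.
define <4 x i32> @trunc_insert_sketch(<2 x i64> %x) {
  %lo64 = extractelement <2 x i64> %x, i64 0
  %lo32 = trunc i64 %lo64 to i32
  %v = insertelement <4 x i32> poison, i32 %lo32, i64 0
  ret <4 x i32> %v
}

; Form B: bitcast + identity shuffle. The bitcast alone would leave real (but
; unwanted) bits in lanes 1-3; the undef mask elements are what re-establish
; that those lanes are "don't care".
define <4 x i32> @bitcast_shuffle_sketch(<2 x i64> %x) {
  %bc = bitcast <2 x i64> %x to <4 x i32>
  %v = shufflevector <4 x i32> %bc, <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %v
}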

If we can do those transforms, then we can hopefully convert the load to a vector load (easier now that we have opaque ptr?). Once that happens, we're almost back to the "ByRef" version of the IR, so then it should fold similarly...


RKSimon commented Nov 21, 2022

> Ie, we can replace a trunc + insert with a bitcast alone, but then we've lost information that the high elements are undef/poison. Is that better or worse than bitcast + identity shuffle with undef elements in the mask? Note that this will also need adjustments for endian.

I suppose if we can guarantee that we are populating the entire vector then the undef/poison issue is benign.

rotateright added a commit that referenced this issue Nov 21, 2022
The option was added with https://reviews.llvm.org/D102496,
and currently the name is accurate, but I am hoping to add
a load transform that is not a scalarization. See issue #17113.
rotateright added a commit that referenced this issue Nov 21, 2022
An early run of VectorCombine was added with D102496 specifically to
deal with unnecessary vector ops produced with the C matrix extension.
This patch is proposing to try those folds in general and add a pair
of load folds to the menu.

The load transform will partly solve (see PhaseOrdering diffs) a
longstanding vectorization perf bug by removing redundant loads via GVN:
issue #17113

The main reason for not enabling the extra pass generally in the initial
patch was compile-time cost. The cost of VectorCombine was significantly
(surprisingly) improved with:
87debda
https://llvm-compile-time-tracker.com/compare.php?from=ffe05b8f57d97bc4340f791cb386c8d00e0739f2&to=87debdadaf18f8a5c7e5d563889e10731dc3554d&stat=instructions:u

...so the extra run is going to cost very little now - the total cost of
the 2 runs should be less than the 1 run before that micro-optimization:
https://llvm-compile-time-tracker.com/compare.php?from=5e8c2026d10e8e2c93c038c776853bed0e7c8fc1&to=2c4b68eab5ae969811f422714e0eba44c5f7eefb&stat=instructions:u

It may be possible to reduce the cost slightly more with a few more
earlier-exits like that, but it's probably in the noise based on timing
experiments.

Differential Revision: https://reviews.llvm.org/D138353
@rotateright

> I suppose if we can guarantee that we are populating the entire vector then the undef/poison issue is benign.

Right - although thinking about this example a bit more, we're going to need an identity shuffle either way because the vector (128-bit) is larger than the load (64-bit).

Maybe we can enhance VectorCombine to widen the load first by peeking through the trunc.

I suspect there would be cost problems with trying to increase the vector width to 256-bit as shown in your code example.
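
For reference, a hedged sketch (hypothetical function name, little-endian assumed) of where widening through the trunc would ideally land for this case: the two scalar i64 loads collapse into one 128-bit load, and the trunc/lshr chains become a bitcast plus a single shuffle.

define <4 x float> @ConvertVectors_ByVal_widened_sketch(ptr align 16 dereferenceable(16) %V) {
  ; one 16-byte load instead of two 8-byte loads (legal: dereferenceable(16))
  %wide = load <2 x i64>, ptr %V, align 16
  ; reinterpret the bits as floats and pick lanes 0, 1, 2, 2
  %f = bitcast <2 x i64> %wide to <4 x float>
  %res = shufflevector <4 x float> %f, <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %res
}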


RKSimon commented Nov 21, 2022

> I suspect there would be cost problems with trying to increase the vector width to 256-bit as shown in your code example.

Yes, I'd expect the SLP vectorizer to get that, so if it doesn't then it might not be profitable.

@rotateright

Another micro-optimization for VectorCombine:
ede6d608f4
...that should definitely make it faster to run twice now than it took to run once before.

rotateright added a commit that referenced this issue Nov 28, 2022
@rotateright

Proposal to canonicalize towards bitcast + shuffle (away from trunc + shift) - it's split into 3 pieces to reduce risk and ease review, but I consider it one fold:
https://reviews.llvm.org/D138872
https://reviews.llvm.org/D138873
https://reviews.llvm.org/D138874

There's an annoying "lack-of-CSE" artifact in the test tracking this bug. We might be able to bend instcombine to deal with that rather than go down another pipeline-altering rabbit hole. That would allow forming a vector load.
Then, we're still left with a final scalar load that GVN is not recognizing as overlapping. Either the load needs to be narrowed or GVN needs to be enhanced to see that.

rotateright added a commit that referenced this issue Dec 12, 2022
…ment

This replaces patches that tried to convert related patterns to shuffles
(D138872, D138873, D138874 - reverted/abandoned) but caused codegen
problems and were questionable as a canonicalization because an
insertelement is a simpler op than a shuffle.

This detects a larger pattern -- insert-of-insert -- and replaces with
another insert, so this hopefully does not cause any problems.

As noted by TODO items in the code and tests, this could go a lot further.
But this is enough to reduce the motivating test from issue #17113.

Example proofs:
https://alive2.llvm.org/ce/z/NnUv3a

I drafted a version of this for AggressiveInstCombine, but it seems that
would uncover yet another phase ordering gap. If we do generalize this to
handle the full range of potential patterns, that may be worth looking at
again.

Differential Revision: https://reviews.llvm.org/D139668
@rotateright

We're down to this form with 2 loads after combining the first two inserts:

define noundef <4 x float> @_Z20ConvertVectors_ByValRKDv4_f(ptr align 16 dereferenceable(16) %V) {
entry:
  %0 = load <4 x float>, ptr %V, align 16
  %1 = getelementptr i8, ptr %V, i64 8
  %V.val2637 = load i64, ptr %1, align 8, !tbaa.struct !6
  %2 = trunc i64 %V.val2637 to i32
  %3 = bitcast i32 %2 to float
  %vecinit11 = insertelement <4 x float> %0, float %3, i64 2
  %vecinit16 = insertelement <4 x float> %vecinit11, float %3, i64 3
  ret <4 x float> %vecinit16
}

And GVN + instcombine is able to reduce that to a single load + shuffle, so we have another phase ordering problem.

@rotateright

I think we've really hit the hard problem of phase ordering -- #17113 (comment) -- that we managed to sidestep on the previous example.

I did confirm that just adding a GVN run later in the pipeline is very costly in compile-time:
https://llvm-compile-time-tracker.com/compare.php?from=a274d62fecfc3f49065f3fcdcb9577637778e0bc&to=d211b22ba1f4ee87faac2fedd5627bbf9f945c01&stat=instructions:u

NewPM-O3:


Benchmark | Old | New
-- | -- | --
kimwitu++ | 49008M | 49638M (+1.29%)
sqlite3 | 49587M | 51559M (+3.98%)
consumer-typeset | 43061M | 44187M (+2.61%)
Bullet | 111829M | 113963M (+1.91%)
tramp3d-v4 | 108415M | 112719M (+3.97%)
mafft | 45181M | 46892M (+3.79%)
ClamAV | 69184M | 71392M (+3.19%)
lencod | 84530M | 88648M (+4.87%)
SPASS | 55491M | 56501M (+1.82%)
7zip | 224276M | 227877M (+1.61%)
geomean | 72783M | 74892M (+2.90%)



rotateright commented Dec 14, 2022

Adding GVN does show some small-ish improvements on other regression tests, so it really is just a question of cost:

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index f545fb4f11be..2e6607b07d46 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1178,7 +1178,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
   }
   // Enhance/cleanup vector code.
   FPM.addPass(VectorCombinePass());
-  FPM.addPass(GVNPass());
+  if (Level != OptimizationLevel::O1)
+    FPM.addPass(GVNPass());
 
   if (!IsFullLTO) {
     FPM.addPass(InstCombinePass());
diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll
index 171fe16acb10..a6bec494d9b6 100644
--- a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll
+++ b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll
@@ -51,12 +51,9 @@ define i32 @main() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[INPUT_RELOAD_ADDR13_I]] to i32*
 ; CHECK-NEXT:    [[N_VAL3_RELOAD_ADDR11_I:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[N_VAL3_RELOAD_ADDR11_I]] to i32*
-; CHECK-NEXT:    [[N_VAL3_RELOAD12_I:%.*]] = load i32, i32* [[TMP4]], align 4, !noalias !3
-; CHECK-NEXT:    [[SUM7_I:%.*]] = add i32 [[N_VAL3_RELOAD12_I]], 2
-; CHECK-NEXT:    store i32 [[SUM7_I]], i32* [[TMP4]], align 4, !noalias !3
+; CHECK-NEXT:    store i32 3, i32* [[TMP4]], align 4, !noalias !3
 ; CHECK-NEXT:    store i32 4, i32* [[TMP3]], align 4, !noalias !3
-; CHECK-NEXT:    [[SUM7_I7:%.*]] = add i32 [[N_VAL3_RELOAD12_I]], 6
-; CHECK-NEXT:    tail call void @print(i32 [[SUM7_I7]]), !noalias !6
+; CHECK-NEXT:    tail call void @print(i32 7), !noalias !6
 ; CHECK-NEXT:    tail call void @deallocate(i8* [[TMP0]]), !noalias !6
 ; CHECK-NEXT:    ret i32 0
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
index 6a4ed2cd056a..dfa95aafde1a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -1406,7 +1406,7 @@ define i32 @disabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b
 ; O3DEFAULT-NEXT:    [[TMP24:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_44]], align 4
 ; O3DEFAULT-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[TMP24]], [[TMP2]]
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP25]], ptr [[ARRAYIDX2_44]], align 4
-; O3DEFAULT-NEXT:    [[TMP26:%.*]] = load i32, ptr [[A]], align 4
+; O3DEFAULT-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
 ; O3DEFAULT-NEXT:    ret i32 [[TMP26]]
 ;
 ; Os-LABEL: @disabled(
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
index 7cbce461a492..d8fb2e4ca8c5 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -21,31 +21,27 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = shl i32 [[N_VEC]], 1
 ; CHECK-NEXT:    [[IND_END7:%.*]] = getelementptr i8, ptr [[PSRCA:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRCB:%.*]], i32 [[TMP2]]
+; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRCB:%.*]], i32 [[TMP0]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRCA]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT:    [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT:    [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PSRCB]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRCA]], i32 [[TMP1]]
+; CHECK-NEXT:    [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[TMP1]]
+; CHECK-NEXT:    [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PSRCB]], i32 [[TMP1]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
 ; CHECK-NEXT:    [[WIDE_LOAD15:%.*]] = load <8 x i16>, ptr [[NEXT_GEP14]], align 2
-; CHECK-NEXT:    [[TMP7:%.*]] = sext <8 x i16> [[WIDE_LOAD15]] to <8 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = mul nsw <8 x i32> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = ashr <8 x i32> [[TMP8]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT:    [[TMP10:%.*]] = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP9]], <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc <8 x i32> [[TMP10]] to <8 x i16>
-; CHECK-NEXT:    store <8 x i16> [[TMP11]], ptr [[NEXT_GEP13]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD15]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = ashr <8 x i32> [[TMP4]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP5]], <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc <8 x i32> [[TMP6]] to <8 x i16>
+; CHECK-NEXT:    store <8 x i16> [[TMP7]], ptr [[NEXT_GEP13]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER16]]
@@ -61,15 +57,15 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    [[PDST_ADDR_04:%.*]] = phi ptr [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_04_PH]], [[WHILE_BODY_PREHEADER16]] ]
 ; CHECK-NEXT:    [[PSRCB_ADDR_03:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_03_PH]], [[WHILE_BODY_PREHEADER16]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PSRCA_ADDR_05]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[PSRCA_ADDR_05]], align 2
-; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP13]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[PSRCA_ADDR_05]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP9]] to i32
 ; CHECK-NEXT:    [[INCDEC_PTR1]] = getelementptr inbounds i16, ptr [[PSRCB_ADDR_03]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[PSRCB_ADDR_03]], align 2
-; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[TMP14]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[PSRCB_ADDR_03]], align 2
+; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[TMP10]] to i32
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
 ; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[MUL]], 15
-; CHECK-NEXT:    [[TMP15:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767)
-; CHECK-NEXT:    [[CONV3:%.*]] = trunc i32 [[TMP15]] to i16
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767)
+; CHECK-NEXT:    [[CONV3:%.*]] = trunc i32 [[TMP11]] to i16
 ; CHECK-NEXT:    [[INCDEC_PTR4]] = getelementptr inbounds i16, ptr [[PDST_ADDR_04]], i32 1
 ; CHECK-NEXT:    store i16 [[CONV3]], ptr [[PDST_ADDR_04]], align 2
 ; CHECK-NEXT:    [[DEC]] = add i32 [[BLKCNT_06]], -1
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
index 11b6f72793db..d934c080965e 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
@@ -29,12 +29,6 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT9]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT11]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT13]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -16
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
@@ -44,58 +38,58 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK:       vector.ph.new:
 ; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP3]], -2
 ; CHECK-NEXT:    [[TMP5:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT10]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT12]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT14]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP10:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT12]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT14]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP13]], align 8, !tbaa [[TBAA3:![0-9]+]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 4
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP15]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 8
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x double>, ptr [[TMP17]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 12
-; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x double>, ptr [[TMP19]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP21:%.*]] = fmul fast <4 x double> [[WIDE_LOAD]], [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = fmul fast <4 x double> [[WIDE_LOAD6]], [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7]], [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8]], [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT:    store <4 x double> [[TMP21]], ptr [[TMP25]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 4
-; CHECK-NEXT:    store <4 x double> [[TMP22]], ptr [[TMP27]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 8
-; CHECK-NEXT:    store <4 x double> [[TMP23]], ptr [[TMP29]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 12
-; CHECK-NEXT:    store <4 x double> [[TMP24]], ptr [[TMP31]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 4
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP14]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 8
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x double>, ptr [[TMP15]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 12
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x double>, ptr [[TMP16]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul fast <4 x double> [[WIDE_LOAD]], [[TMP5]]
+; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast <4 x double> [[WIDE_LOAD6]], [[TMP6]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7]], [[TMP7]]
+; CHECK-NEXT:    [[TMP20:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8]], [[TMP8]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
+; CHECK-NEXT:    store <4 x double> [[TMP17]], ptr [[TMP21]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i64 4
+; CHECK-NEXT:    store <4 x double> [[TMP18]], ptr [[TMP22]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i64 8
+; CHECK-NEXT:    store <4 x double> [[TMP19]], ptr [[TMP23]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i64 12
+; CHECK-NEXT:    store <4 x double> [[TMP20]], ptr [[TMP24]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX_NEXT]]
-; CHECK-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x double>, ptr [[TMP33]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 4
-; CHECK-NEXT:    [[WIDE_LOAD6_1:%.*]] = load <4 x double>, ptr [[TMP35]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 8
-; CHECK-NEXT:    [[WIDE_LOAD7_1:%.*]] = load <4 x double>, ptr [[TMP37]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 12
-; CHECK-NEXT:    [[WIDE_LOAD8_1:%.*]] = load <4 x double>, ptr [[TMP39]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP41:%.*]] = fmul fast <4 x double> [[WIDE_LOAD_1]], [[TMP9]]
-; CHECK-NEXT:    [[TMP42:%.*]] = fmul fast <4 x double> [[WIDE_LOAD6_1]], [[TMP10]]
-; CHECK-NEXT:    [[TMP43:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7_1]], [[TMP11]]
-; CHECK-NEXT:    [[TMP44:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8_1]], [[TMP12]]
-; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX_NEXT]]
-; CHECK-NEXT:    store <4 x double> [[TMP41]], ptr [[TMP45]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 4
-; CHECK-NEXT:    store <4 x double> [[TMP42]], ptr [[TMP47]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 8
-; CHECK-NEXT:    store <4 x double> [[TMP43]], ptr [[TMP49]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 12
-; CHECK-NEXT:    store <4 x double> [[TMP44]], ptr [[TMP51]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX_NEXT]]
+; CHECK-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x double>, ptr [[TMP25]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 4
+; CHECK-NEXT:    [[WIDE_LOAD6_1:%.*]] = load <4 x double>, ptr [[TMP26]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 8
+; CHECK-NEXT:    [[WIDE_LOAD7_1:%.*]] = load <4 x double>, ptr [[TMP27]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 12
+; CHECK-NEXT:    [[WIDE_LOAD8_1:%.*]] = load <4 x double>, ptr [[TMP28]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP29:%.*]] = fmul fast <4 x double> [[WIDE_LOAD_1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast <4 x double> [[WIDE_LOAD6_1]], [[TMP10]]
+; CHECK-NEXT:    [[TMP31:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7_1]], [[TMP11]]
+; CHECK-NEXT:    [[TMP32:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8_1]], [[TMP12]]
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX_NEXT]]
+; CHECK-NEXT:    store <4 x double> [[TMP29]], ptr [[TMP33]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 4
+; CHECK-NEXT:    store <4 x double> [[TMP30]], ptr [[TMP34]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 8
+; CHECK-NEXT:    store <4 x double> [[TMP31]], ptr [[TMP35]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 12
+; CHECK-NEXT:    store <4 x double> [[TMP32]], ptr [[TMP36]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDEX_NEXT_1]] = add nuw i64 [[INDEX]], 32
 ; CHECK-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
 ; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
@@ -105,115 +99,115 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
 ; CHECK-NEXT:    br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]]
 ; CHECK:       vector.body.epil:
-; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX_UNR]]
-; CHECK-NEXT:    [[WIDE_LOAD_EPIL:%.*]] = load <4 x double>, ptr [[TMP53]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds double, ptr [[TMP53]], i64 4
-; CHECK-NEXT:    [[WIDE_LOAD6_EPIL:%.*]] = load <4 x double>, ptr [[TMP55]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds double, ptr [[TMP53]], i64 8
-; CHECK-NEXT:    [[WIDE_LOAD7_EPIL:%.*]] = load <4 x double>, ptr [[TMP57]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds double, ptr [[TMP53]], i64 12
-; CHECK-NEXT:    [[WIDE_LOAD8_EPIL:%.*]] = load <4 x double>, ptr [[TMP59]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP61:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD_EPIL]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP62:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD6_EPIL]], [[BROADCAST_SPLAT10]]
-; CHECK-NEXT:    [[TMP63:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD7_EPIL]], [[BROADCAST_SPLAT12]]
-; CHECK-NEXT:    [[TMP64:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD8_EPIL]], [[BROADCAST_SPLAT14]]
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX_UNR]]
-; CHECK-NEXT:    store <4 x double> [[TMP61]], ptr [[TMP65]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr inbounds double, ptr [[TMP65]], i64 4
-; CHECK-NEXT:    store <4 x double> [[TMP62]], ptr [[TMP67]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds double, ptr [[TMP65]], i64 8
-; CHECK-NEXT:    store <4 x double> [[TMP63]], ptr [[TMP69]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds double, ptr [[TMP65]], i64 12
-; CHECK-NEXT:    store <4 x double> [[TMP64]], ptr [[TMP71]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX_UNR]]
+; CHECK-NEXT:    [[WIDE_LOAD_EPIL:%.*]] = load <4 x double>, ptr [[TMP37]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, ptr [[TMP37]], i64 4
+; CHECK-NEXT:    [[WIDE_LOAD6_EPIL:%.*]] = load <4 x double>, ptr [[TMP38]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds double, ptr [[TMP37]], i64 8
+; CHECK-NEXT:    [[WIDE_LOAD7_EPIL:%.*]] = load <4 x double>, ptr [[TMP39]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds double, ptr [[TMP37]], i64 12
+; CHECK-NEXT:    [[WIDE_LOAD8_EPIL:%.*]] = load <4 x double>, ptr [[TMP40]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP41:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD_EPIL]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP42:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD6_EPIL]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP43:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD7_EPIL]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP44:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD8_EPIL]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX_UNR]]
+; CHECK-NEXT:    store <4 x double> [[TMP41]], ptr [[TMP45]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 4
+; CHECK-NEXT:    store <4 x double> [[TMP42]], ptr [[TMP46]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 8
+; CHECK-NEXT:    store <4 x double> [[TMP43]], ptr [[TMP47]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 12
+; CHECK-NEXT:    store <4 x double> [[TMP44]], ptr [[TMP48]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    br label [[MIDDLE_BLOCK]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER15]]
 ; CHECK:       for.body.preheader15:
 ; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[TMP73:%.*]] = xor i64 [[INDVARS_IV_PH]], -1
-; CHECK-NEXT:    [[TMP74:%.*]] = add nsw i64 [[TMP73]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[TMP49:%.*]] = xor i64 [[INDVARS_IV_PH]], -1
+; CHECK-NEXT:    [[TMP50:%.*]] = add nsw i64 [[TMP49]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    [[XTRAITER16:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7
 ; CHECK-NEXT:    [[LCMP_MOD17_NOT:%.*]] = icmp eq i64 [[XTRAITER16]], 0
 ; CHECK-NEXT:    br i1 [[LCMP_MOD17_NOT]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL_PREHEADER:%.*]]
 ; CHECK:       for.body.prol.preheader:
-; CHECK-NEXT:    [[TMP75:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP51:%.*]] = fdiv fast double 1.000000e+00, [[A]]
 ; CHECK-NEXT:    br label [[FOR_BODY_PROL:%.*]]
 ; CHECK:       for.body.prol:
 ; CHECK-NEXT:    [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_BODY_PROL]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PROL_PREHEADER]] ]
 ; CHECK-NEXT:    [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_NEXT:%.*]], [[FOR_BODY_PROL]] ], [ 0, [[FOR_BODY_PROL_PREHEADER]] ]
 ; CHECK-NEXT:    [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_PROL]]
 ; CHECK-NEXT:    [[T0_PROL:%.*]] = load double, ptr [[ARRAYIDX_PROL]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP76:%.*]] = fmul fast double [[T0_PROL]], [[TMP75]]
+; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast double [[T0_PROL]], [[TMP51]]
 ; CHECK-NEXT:    [[ARRAYIDX2_PROL:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_PROL]]
-; CHECK-NEXT:    store double [[TMP76]], ptr [[ARRAYIDX2_PROL]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP52]], ptr [[ARRAYIDX2_PROL]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 1
 ; CHECK-NEXT:    [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
 ; CHECK-NEXT:    [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER16]]
 ; CHECK-NEXT:    br i1 [[PROL_ITER_CMP_NOT]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       for.body.prol.loopexit:
 ; CHECK-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER15]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ]
-; CHECK-NEXT:    [[TMP77:%.*]] = icmp ult i64 [[TMP74]], 7
-; CHECK-NEXT:    br i1 [[TMP77]], label [[FOR_END]], label [[FOR_BODY_PREHEADER15_NEW:%.*]]
+; CHECK-NEXT:    [[TMP53:%.*]] = icmp ult i64 [[TMP50]], 7
+; CHECK-NEXT:    br i1 [[TMP53]], label [[FOR_END]], label [[FOR_BODY_PREHEADER15_NEW:%.*]]
 ; CHECK:       for.body.preheader15.new:
-; CHECK-NEXT:    [[TMP78:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP79:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP80:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP81:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP82:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP83:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP84:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP85:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP54:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP55:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP56:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP57:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP58:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP59:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP60:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP61:%.*]] = fdiv fast double 1.000000e+00, [[A]]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER15_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[T0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP86:%.*]] = fmul fast double [[T0]], [[TMP78]]
+; CHECK-NEXT:    [[TMP62:%.*]] = fmul fast double [[T0]], [[TMP54]]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store double [[TMP86]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP62]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    [[T0_1:%.*]] = load double, ptr [[ARRAYIDX_1]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP87:%.*]] = fmul fast double [[T0_1]], [[TMP79]]
+; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast double [[T0_1]], [[TMP55]]
 ; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    store double [[TMP87]], ptr [[ARRAYIDX2_1]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP63]], ptr [[ARRAYIDX2_1]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_1]]
 ; CHECK-NEXT:    [[T0_2:%.*]] = load double, ptr [[ARRAYIDX_2]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP88:%.*]] = fmul fast double [[T0_2]], [[TMP80]]
+; CHECK-NEXT:    [[TMP64:%.*]] = fmul fast double [[T0_2]], [[TMP56]]
 ; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_1]]
-; CHECK-NEXT:    store double [[TMP88]], ptr [[ARRAYIDX2_2]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP64]], ptr [[ARRAYIDX2_2]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_2]]
 ; CHECK-NEXT:    [[T0_3:%.*]] = load double, ptr [[ARRAYIDX_3]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP89:%.*]] = fmul fast double [[T0_3]], [[TMP81]]
+; CHECK-NEXT:    [[TMP65:%.*]] = fmul fast double [[T0_3]], [[TMP57]]
 ; CHECK-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_2]]
-; CHECK-NEXT:    store double [[TMP89]], ptr [[ARRAYIDX2_3]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP65]], ptr [[ARRAYIDX2_3]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
 ; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_3]]
 ; CHECK-NEXT:    [[T0_4:%.*]] = load double, ptr [[ARRAYIDX_4]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP90:%.*]] = fmul fast double [[T0_4]], [[TMP82]]
+; CHECK-NEXT:    [[TMP66:%.*]] = fmul fast double [[T0_4]], [[TMP58]]
 ; CHECK-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_3]]
-; CHECK-NEXT:    store double [[TMP90]], ptr [[ARRAYIDX2_4]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP66]], ptr [[ARRAYIDX2_4]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5
 ; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_4]]
 ; CHECK-NEXT:    [[T0_5:%.*]] = load double, ptr [[ARRAYIDX_5]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP91:%.*]] = fmul fast double [[T0_5]], [[TMP83]]
+; CHECK-NEXT:    [[TMP67:%.*]] = fmul fast double [[T0_5]], [[TMP59]]
 ; CHECK-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_4]]
-; CHECK-NEXT:    store double [[TMP91]], ptr [[ARRAYIDX2_5]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP67]], ptr [[ARRAYIDX2_5]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_5]]
 ; CHECK-NEXT:    [[T0_6:%.*]] = load double, ptr [[ARRAYIDX_6]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP92:%.*]] = fmul fast double [[T0_6]], [[TMP84]]
+; CHECK-NEXT:    [[TMP68:%.*]] = fmul fast double [[T0_6]], [[TMP60]]
 ; CHECK-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_5]]
-; CHECK-NEXT:    store double [[TMP92]], ptr [[ARRAYIDX2_6]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP68]], ptr [[ARRAYIDX2_6]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_6]]
 ; CHECK-NEXT:    [[T0_7:%.*]] = load double, ptr [[ARRAYIDX_7]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP93:%.*]] = fmul fast double [[T0_7]], [[TMP85]]
+; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast double [[T0_7]], [[TMP61]]
 ; CHECK-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_6]]
-; CHECK-NEXT:    store double [[TMP93]], ptr [[ARRAYIDX2_7]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP69]], ptr [[ARRAYIDX2_7]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
 ; CHECK-NEXT:    [[EXITCOND_NOT_7:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_7]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT_7]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 77cbc70ff369..334405b12e65 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -47,24 +47,18 @@ define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull a
 define noundef <4 x float> @ConvertVectors_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %V) #0 {
 ; SSE-LABEL: @ConvertVectors_ByVal(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
-; SSE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[V]], i64 8
-; SSE-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
-; SSE-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
-; SSE-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; SSE-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
-; SSE-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
+; SSE-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[V:%.*]], align 16
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x float>
+; SSE-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x float>
+; SSE-NEXT:    [[VECINIT16:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[BC]], <4 x i32> <i32 0, i32 1, i32 6, i32 6>
 ; SSE-NEXT:    ret <4 x float> [[VECINIT16]]
 ;
 ; AVX-LABEL: @ConvertVectors_ByVal(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
-; AVX-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[V]], i64 8
-; AVX-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
-; AVX-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
-; AVX-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; AVX-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
-; AVX-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
+; AVX-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[V:%.*]], align 16
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x float>
+; AVX-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x float>
+; AVX-NEXT:    [[VECINIT16:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[BC]], <4 x i32> <i32 0, i32 1, i32 6, i32 6>
 ; AVX-NEXT:    ret <4 x float> [[VECINIT16]]
 ;
 entry:


RKSimon commented Dec 16, 2022

Alternatively - is there anything we can add/move before the GVN pass to improve its chances?

@rotateright

I've been searching for something that could allow us to get the necessary transforms without adding a pass, and I can't see anything.

The existing GVN + InstCombine makes the pattern that allows VectorCombine to create a wide load, and only after that load is created, another round of GVN + InstCombine allows combining the loads.

Also note that the GVN transform has been flagged as not poison-safe, so there's a chance that we'll lose the optimization on the first example. :(


RKSimon commented May 12, 2023

CC @nikic - this was one of the tickets I mentioned at EuroLLVM regarding possible improvements to FunctionAttr/Attributor/GVN or adding an additional GVN run
