This code was reduced from a function that converted between the SIMD vector classes used by two different libraries; the source and destination vectors both use `<4 x float>` as their underlying storage, but notionally hold only {x, y, z} (the destination duplicates z into the last lane; the source leaves it undefined, I think).

```cpp
typedef float __m128 __attribute__((__vector_size__(16)));

union ElementWiseAccess {
  ElementWiseAccess(__m128 v) : ReprM128(v) {}
  __m128 ReprM128;
  float ReprFloatArray[4];
  float getAt(int i) const { return ReprFloatArray[i]; }
};

// Making this return `const ElementWiseAccess` instead of `const ElementWiseAccess &`
// still results in a failure to optimize, but in a different way.
static const ElementWiseAccess &castToElementWiseAccess(const __m128 &t) {
  return reinterpret_cast<const ElementWiseAccess &>(t);
}

__m128 ConvertVectors(const __m128 &V) {
  // Replacing `castToElementWiseAccess` with directly calling
  // `ElementWiseAccess` makes the issue go away.
  return (__m128) { castToElementWiseAccess(V).getAt(0),  //
                    castToElementWiseAccess(V).getAt(1),  //
                    castToElementWiseAccess(V).getAt(2),  //
                    castToElementWiseAccess(V).getAt(2) };
}
```

clang -O3 produces:

```llvm
define <4 x float> @_Z14ConvertVectorsRKDv4_f(<4 x float>* nocapture readonly %V) #0 {
  %1 = bitcast <4 x float>* %V to [4 x float]*
  %2 = getelementptr inbounds <4 x float>* %V, i64 0, i64 0
  %3 = load float* %2, align 4, !tbaa !0
  %4 = insertelement <4 x float> undef, float %3, i32 0
  %5 = getelementptr inbounds [4 x float]* %1, i64 0, i64 1
  %6 = load float* %5, align 4, !tbaa !0
  %7 = insertelement <4 x float> %4, float %6, i32 1
  %8 = getelementptr inbounds [4 x float]* %1, i64 0, i64 2
  %9 = load float* %8, align 4, !tbaa !0
  %10 = insertelement <4 x float> %7, float %9, i32 2
  %11 = insertelement <4 x float> %10, float %9, i32 3
  ret <4 x float> %11
}
```

It appears that something is interfering with folding the load/insertelement sequence into a vector load plus a shufflevector.

Making the modification indicated in the comments (having `castToElementWiseAccess` return by value instead of by reference) results in:

```llvm
define <4 x float> @_Z14ConvertVectorsRKDv4_f(<4 x float>* nocapture readonly %V) #0 {
  %1 = bitcast <4 x float>* %V to i8*
  %2 = bitcast <4 x float>* %V to double*
  %3 = load double* %2, align 16
  %4 = getelementptr inbounds i8* %1, i64 8
  %5 = bitcast i8* %4 to double*
  %6 = bitcast double %3 to i64
  %trunc = trunc i64 %6 to i32
  %bitcast = bitcast i32 %trunc to float
  %7 = insertelement <4 x float> undef, float %bitcast, i32 0
  %8 = lshr i64 %6, 32
  %9 = trunc i64 %8 to i32
  %10 = bitcast i32 %9 to float
  %11 = insertelement <4 x float> %7, float %10, i32 1
  %12 = load double* %5, align 8
  %13 = bitcast double %12 to i64
  %trunc6 = trunc i64 %13 to i32
  %bitcast7 = bitcast i32 %trunc6 to float
  %14 = insertelement <4 x float> %11, float %bitcast7, i32 2
  %15 = insertelement <4 x float> %14, float %bitcast7, i32 3
  ret <4 x float> %15
}
```

The issue in this case seems to be that clang lowers `castToElementWiseAccess` as returning `{double, double}`, which then prevents a `<4 x float>` load from being generated.
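As an aside, that coercion looks like plain SysV x86-64 classification rather than anything specific to this reduction: a 16-byte trivially copyable union whose eightbytes both classify as SSE is returned in two xmm registers, which clang models as `{double, double}`. A minimal sketch (the names here are mine, not from the test case):

```cpp
typedef float __m128 __attribute__((__vector_size__(16)));

union ElementWise {
  __m128 Vec;
  float Lanes[4];
};

// Returning this 16-byte union by value: both eightbytes classify as SSE,
// so clang coerces the return to { double, double } in xmm0/xmm1 instead
// of keeping a single <4 x float>.
ElementWise PassThrough(ElementWise U) { return U; }
```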
Replacing the call to `castToElementWiseAccess` with a direct invocation of the constructor (e.g. `ElementWiseAccess(V).getAt(n)` for each lane) results in the following code, which is the desired codegen for the initial test case:

```llvm
define <4 x float> @_Z14ConvertVectorsRKDv4_f(<4 x float>* nocapture readonly %V) #0 {
  %1 = load <4 x float>* %V, align 16, !tbaa !0
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %2
}
```
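For reference, a sketch of the shuffle-based formulation the optimizer should effectively reach; `__builtin_shufflevector` is a Clang builtin, and this variant is my addition rather than part of the reduction:

```cpp
typedef float __m128 __attribute__((__vector_size__(16)));

// Express the {x, y, z, z} conversion as a single shuffle; at -O3 this
// should lower straight to the load + shufflevector IR shown above.
__m128 ConvertVectors_Shuffle(const __m128 &V) {
  return __builtin_shufflevector(V, V, 0, 1, 2, 2);
}
```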
Trunk still has issues with this, and it reminds me of Bug #21780:

```cpp
typedef float __m128 __attribute__((__vector_size__(16)));

union ElementWiseAccess {
  ElementWiseAccess(__m128 v) : ReprM128(v) {}
  __m128 ReprM128;
  float ReprFloatArray[4];
  float getAt(int i) const { return ReprFloatArray[i]; }
};

static const ElementWiseAccess &castToElementWiseAccess_ByRef(const __m128 &t) {
  return reinterpret_cast<const ElementWiseAccess &>(t);
}

static const ElementWiseAccess castToElementWiseAccess_ByVal(const __m128 &t) {
  return reinterpret_cast<const ElementWiseAccess &>(t);
}

__m128 ConvertVectors_ByRef(const __m128 &V) {
  return (__m128) { castToElementWiseAccess_ByRef(V).getAt(0),  //
                    castToElementWiseAccess_ByRef(V).getAt(1),  //
                    castToElementWiseAccess_ByRef(V).getAt(2),  //
                    castToElementWiseAccess_ByRef(V).getAt(2) };
}

__m128 ConvertVectors_ByVal(const __m128 &V) {
  return (__m128) { castToElementWiseAccess_ByVal(V).getAt(0),  //
                    castToElementWiseAccess_ByVal(V).getAt(1),  //
                    castToElementWiseAccess_ByVal(V).getAt(2),  //
                    castToElementWiseAccess_ByVal(V).getAt(2) };
}

__m128 ConvertVectors_ByCopy(const __m128 &V) {
  return (__m128) { ElementWiseAccess(V).getAt(0),  //
                    ElementWiseAccess(V).getAt(1),  //
                    ElementWiseAccess(V).getAt(2),  //
                    ElementWiseAccess(V).getAt(2) };
}
```

Looking at the IR, it knows that the entire vector load is dereferenceable, but it still makes a mess of combining the inserted loads:

```llvm
define <4 x float> @ConvertVectors_ByRef(<4 x float>* nocapture readonly dereferenceable(16)) {
  %2 = bitcast <4 x float>* %0 to [4 x float]*
  %3 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
  %4 = load float, float* %3, align 4, !tbaa !1
  %5 = insertelement <4 x float> undef, float %4, i32 0
  %6 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 1
  %7 = load float, float* %6, align 4, !tbaa !1
  %8 = insertelement <4 x float> %5, float %7, i32 1
  %9 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 2
  %10 = load float, float* %9, align 4, !tbaa !1
  %11 = insertelement <4 x float> %8, float %10, i32 2
  %12 = insertelement <4 x float> %11, float %10, i32 3
  ret <4 x float> %12
}

define <4 x float> @ConvertVectors_ByVal(<4 x float>* nocapture readonly dereferenceable(16)) {
  %2 = bitcast <4 x float>* %0 to i64*
  %3 = load i64, i64* %2, align 16
  %4 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %5 = trunc i64 %3 to i32
  %6 = bitcast i32 %5 to float
  %7 = insertelement <4 x float> undef, float %6, i32 0
  %8 = lshr i64 %3, 32
  %9 = trunc i64 %8 to i32
  %10 = bitcast i32 %9 to float
  %11 = insertelement <4 x float> %7, float %10, i32 1
  %12 = bitcast float* %4 to i64*
  %13 = load i64, i64* %12, align 8
  %14 = trunc i64 %13 to i32
  %15 = bitcast i32 %14 to float
  %16 = insertelement <4 x float> %11, float %15, i32 2
  %17 = insertelement <4 x float> %16, float %15, i32 3
  ret <4 x float> %17
}

define <4 x float> @ConvertVectors_ByCopy(<4 x float>* nocapture readonly dereferenceable(16)) {
  %2 = load <4 x float>, <4 x float>* %0, align 16, !tbaa !5
  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %3
}
```

Resulting in the final assembly:

```asm
ConvertVectors_ByRef(float __vector(4) const&):        # @ConvertVectors_ByRef(float __vector(4) const&)
  vmovss  8(%rdi), %xmm0               # xmm0 = mem[0],zero,zero,zero
  vmovsd  (%rdi), %xmm1                # xmm1 = mem[0],zero
  vshufps $4, %xmm0, %xmm1, %xmm0      # xmm0 = xmm1[0,1],xmm0[0,0]
  retq

ConvertVectors_ByVal(float __vector(4) const&):        # @ConvertVectors_ByVal(float __vector(4) const&)
  vmovss  (%rdi), %xmm0                # xmm0 = mem[0],zero,zero,zero
  vmovss  8(%rdi), %xmm1               # xmm1 = mem[0],zero,zero,zero
  vinsertps $16, 4(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0],xmm0[2,3]
  vshufps $4, %xmm1, %xmm0, %xmm0      # xmm0 = xmm0[0,1],xmm1[0,0]
  retq

ConvertVectors_ByCopy(float __vector(4) const&):       # @ConvertVectors_ByCopy(float __vector(4) const&)
  vpermilps $164, (%rdi), %xmm0        # xmm0 = mem[0,1,2,2]
  retq
```
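As another data point worth trying, here is a union-free variant that does the lane extraction through `memcpy` (well-defined type punning in C++). This is my own sketch, and whether trunk folds it to the single-shuffle codegen of `ConvertVectors_ByCopy` or hits the same load-combining mess would need checking:

```cpp
#include <cstring>

typedef float __m128 __attribute__((__vector_size__(16)));

// memcpy-based lane access: the temporary array and the copy are normally
// optimized away entirely, leaving plain element loads for the optimizer
// to combine.
static float getLane(const __m128 &v, int i) {
  float lanes[4];
  std::memcpy(lanes, &v, sizeof(lanes));
  return lanes[i];
}

__m128 ConvertVectors_Memcpy(const __m128 &V) {
  return (__m128){ getLane(V, 0), getLane(V, 1),
                   getLane(V, 2), getLane(V, 2) };
}
```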
For the original example, the bitcast from vector to array might be interfering with subsequent transforms, so: https://reviews.llvm.org/D44833
https://godbolt.org/z/NlK7rA