typedef int v4si __attribute__((vector_size(16))); typedef float v4sf __attribute__((vector_size(16))); v4sf f(v4si f) { return (v4sf){(float)f[1], (float)f[1], (float)f[2], (float)f[3]}; } With -O3, GCC outputs this: f(int __vector(4)): pshufd xmm0, xmm0, 229 cvtdq2ps xmm0, xmm0 ret LLVM outputs this: f(int __vector(4)): pshufd xmm1, xmm0, 85 # xmm1 = xmm0[1,1,1,1] cvtdq2ps xmm1, xmm1 pshufd xmm2, xmm0, 238 # xmm2 = xmm0[2,3,2,3] cvtdq2ps xmm2, xmm2 pshufd xmm0, xmm0, 255 # xmm0 = xmm0[3,3,3,3] cvtdq2ps xmm0, xmm0 unpcklps xmm2, xmm0 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] shufps xmm1, xmm2, 64 # xmm1 = xmm1[0,0],xmm2[0,1] movaps xmm0, xmm1 ret
Current Codegen: https://godbolt.org/z/oYn1df Probably an SLP issue? [Bug #35732] looks very similar
This issue will be fixed by http://reviews.llvm.org/D57059 once it is committed (it is currently under review). Here we have conversions for f[1], f[2] and f[3], i.e. three floating-point values, so I have checked that the patch "Initial support for the vectorization of the non-power-of-2 vectors" handles this case: > ./opt -slp-vectorizer -instcombine -S pr49081.ll ... define dso_local <4 x float> @foo(<4 x i32> %0) { %shuffle = shufflevector <4 x i32> %0, <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef> %2 = sitofp <4 x i32> %shuffle to <4 x float> %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 2> ret <4 x float> %3 }
Committed llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll to track the fix.
(In reply to Anton Afanasyev from comment #2) > This issue is to be fixed by http://reviews.llvm.org/D57059 after committing > (in process of review now). We have conversion for f[1], f[2] and f[3] here, > for three fps, so I have checked patch "Initial support for the > vectorization of the non-power-of-2 vectors" fits here: > > > ./opt -slp-vectorizer -instcombine -S pr49081.ll > ... > define dso_local <4 x float> @foo(<4 x i32> %0) { > %shuffle = shufflevector <4 x i32> %0, <4 x i32> poison, <4 x i32> <i32 1, > i32 2, i32 3, i32 undef> > %2 = sitofp <4 x i32> %shuffle to <4 x float> > %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, > i32 0, i32 1, i32 2> > ret <4 x float> %3 > } For reference, that sequence was not being optimized in IR or in the backend, so I added an instcombine transform to make it easier for SDAG: https://reviews.llvm.org/rG0bab0f616119