https://godbolt.org/z/InfZZK

    #include <x86intrin.h>

    void sum_pairs_128(__m128 f, float *p) {
      p[0] = f[0] + f[1];
      p[1] = f[2] + f[3];
    }

clang -O3 -march=btver2

    _Z13sum_pairs_128Dv4_fPf: # @_Z13sum_pairs_128Dv4_fPf
      vhaddps %xmm0, %xmm0, %xmm0
      vmovss %xmm0, (%rdi)
      vextractps $1, %xmm0, 4(%rdi)
      retq

It would be better if we managed to merge the two adjacent 32-bit stores into a single 64-bit store:

    _Z13sum_pairs_128Dv4_fPf: # @_Z13sum_pairs_128Dv4_fPf
      vhaddps %xmm0, %xmm0, %xmm0
      vmovsd %xmm0, (%rdi)
      retq
Handling float3 cases would be useful as well: https://godbolt.org/z/DRs8Qf

    #include <x86intrin.h>

    void store_float3(__m128 f, float *p) {
      *p++ = f[0];
      *p++ = f[1];
      *p++ = f[2];
    }

    void store_float3_splat(__m128 f, float *p) {
      *p++ = f[0];
      *p++ = f[0];
      *p++ = f[0];
    }

Current output:

    _Z12store_float3Dv4_fPf: # @_Z12store_float3Dv4_fPf
      vmovss %xmm0, (%rdi)
      vextractps $1, %xmm0, 4(%rdi)
      vextractps $2, %xmm0, 8(%rdi)
      retq

    _Z18store_float3_splatDv4_fPf: # @_Z18store_float3_splatDv4_fPf
      vmovss %xmm0, (%rdi)
      vmovss %xmm0, 4(%rdi)
      vmovss %xmm0, 8(%rdi)
      retq

These could instead be:

    _Z12store_float3Dv4_fPf: # @_Z12store_float3Dv4_fPf
      vmovsd %xmm0, (%rdi)
      vextractps $2, %xmm0, 8(%rdi)
      retq

    _Z18store_float3_splatDv4_fPf: # @_Z18store_float3_splatDv4_fPf
      vpermilps $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0]
      vmovsd %xmm0, (%rdi)
      vmovss %xmm0, 8(%rdi)
      retq