LLVM Bugzilla is read-only and represents the historical archive of all LLVM issues filed before November 26, 2021. Use GitHub to submit LLVM bugs.

Bug 41892 - [X86][SSE] Failure to merge 2 x float stores to <2 x float> store
Summary: [X86][SSE] Failure to merge 2 x float stores to <2 x float> store
Status: NEW
Alias: None
Product: libraries
Classification: Unclassified
Component: Backend: X86 (show other bugs)
Version: trunk
Hardware: PC Windows NT
Importance: P enhancement
Assignee: Unassigned LLVM Bugs
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2019-05-15 14:44 PDT by Simon Pilgrim
Modified: 2019-12-21 23:53 PST (History)
5 users (show)

See Also:
Fixed By Commit(s):


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Simon Pilgrim 2019-05-15 14:44:42 PDT
https://godbolt.org/z/InfZZK

#include <x86intrin.h>

/* Reproducer: writes the pairwise sums f[0]+f[1] and f[2]+f[3] to the two
   adjacent floats p[0] and p[1]. */
void sum_pairs_128(__m128 f, float *p) {
  p[0] = f[0] + f[1];
  p[1] = f[2] + f[3];
}


clang -O3 -march=btver2

# Current codegen: the two adjacent results are written with two separate
# 32-bit stores.
_Z13sum_pairs_128Dv4_fPf: # @_Z13sum_pairs_128Dv4_fPf
  vhaddps %xmm0, %xmm0, %xmm0 # xmm0 = [x0+x1, x2+x3, x0+x1, x2+x3] (x = input lanes)
  vmovss %xmm0, (%rdi) # 32-bit store of lane 0 at offset 0
  vextractps $1, %xmm0, 4(%rdi) # 32-bit store of lane 1 at offset 4
  retq

It would be better if we managed to merge the two scalar stores into a single 64-bit store:

# Desired codegen: lanes 0 and 1 are written together with one 64-bit store.
_Z13sum_pairs_128Dv4_fPf: # @_Z13sum_pairs_128Dv4_fPf
  vhaddps %xmm0, %xmm0, %xmm0 # xmm0 = [x0+x1, x2+x3, x0+x1, x2+x3] (x = input lanes)
  vmovsd %xmm0, (%rdi) # 64-bit store of lanes 0 and 1 at offsets 0 and 4
  retq
Comment 1 Simon Pilgrim 2019-05-17 08:48:42 PDT
float3 cases would be useful as well: https://godbolt.org/z/DRs8Qf

#include <x86intrin.h>

/* Reproducer: writes the first three lanes of f to three consecutive floats
   starting at p. */
void store_float3(__m128 f, float *p) {
  *p++ = f[0];
  *p++ = f[1];
  *p++ = f[2];
}

/* Reproducer: writes lane 0 of f to three consecutive floats starting at p. */
void store_float3_splat(__m128 f, float *p) {
  *p++ = f[0];
  *p++ = f[0];
  *p++ = f[0];
}

# Current codegen: three separate 32-bit stores, one per lane.
_Z12store_float3Dv4_fPf: # @_Z12store_float3Dv4_fPf
  vmovss %xmm0, (%rdi) # 32-bit store of lane 0 at offset 0
  vextractps $1, %xmm0, 4(%rdi) # 32-bit store of lane 1 at offset 4
  vextractps $2, %xmm0, 8(%rdi) # 32-bit store of lane 2 at offset 8
  retq
# Current codegen: lane 0 is written three times with three 32-bit stores.
_Z18store_float3_splatDv4_fPf: # @_Z18store_float3_splatDv4_fPf
  vmovss %xmm0, (%rdi) # 32-bit store of lane 0 at offset 0
  vmovss %xmm0, 4(%rdi) # 32-bit store of lane 0 at offset 4
  vmovss %xmm0, 8(%rdi) # 32-bit store of lane 0 at offset 8
  retq

which could be:

# Desired codegen: the first two lanes share one 64-bit store; only the third
# lane needs its own 32-bit store.
_Z12store_float3Dv4_fPf: # @_Z12store_float3Dv4_fPf
  vmovsd %xmm0, (%rdi) # 64-bit store of lanes 0 and 1 at offsets 0 and 4
  vextractps $2, %xmm0, 8(%rdi) # 32-bit store of lane 2 at offset 8
  retq
# Desired codegen: broadcast lane 0 across the register, then write the three
# copies with one 64-bit store plus one 32-bit store.
_Z18store_float3_splatDv4_fPf: # @_Z18store_float3_splatDv4_fPf
  vpermilps $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0]
  vmovsd %xmm0, (%rdi) # 64-bit store of lanes 0 and 1 (both the splat value) at offsets 0 and 4
  vmovss %xmm0, 8(%rdi) # 32-bit store of lane 0 at offset 8
  retq