41892 – [X86][SSE] Failure to merge 2 x float stores to <2 x float> store

LLVM Bugzilla is read-only and represents the historical archive of all LLVM issues filed before November 26, 2021. Use GitHub to submit LLVM bugs.

Bug 41892 - [X86][SSE] Failure to merge 2 x float stores to <2 x float> store

Summary: [X86][SSE] Failure to merge 2 x float stores to <2 x float> store

Status:	NEW

Alias:	None

Product:	libraries
Classification:	Unclassified
Component:	Backend: X86 (show other bugs)
Version:	trunk
Hardware:	PC Windows NT

Importance:	P enhancement
Assignee:	Unassigned LLVM Bugs

URL:
Keywords:

Depends on:
Blocks:

Reported:	2019-05-15 14:44 PDT by Simon Pilgrim
Modified:	2019-12-21 23:53 PST (History)
CC List:	5 users (show)

See Also:	42022
Fixed By Commit(s):

Attachments
Add an attachment (proposed patch, testcase, etc.)

Note You need to log in before you can comment on or make changes to this bug.

Description Simon Pilgrim 2019-05-15 14:44:42 PDT

https://godbolt.org/z/InfZZK

#include <x86intrin.h>

void sum_pairs_128(__m128 f, float *p) {
  p[0] = f[0] + f[1];
  p[1] = f[2] + f[3];
}


clang -O3 -march=btver2

_Z13sum_pairs_128Dv4_fPf: # @_Z13sum_pairs_128Dv4_fPf
  vhaddps %xmm0, %xmm0, %xmm0
  vmovss %xmm0, (%rdi)
  vextractps $1, %xmm0, 4(%rdi)
  retq

It would be better if we managed to merge the two scalar stores into a single 64-bit store:

_Z13sum_pairs_128Dv4_fPf: # @_Z13sum_pairs_128Dv4_fPf
  vhaddps %xmm0, %xmm0, %xmm0
  vmovsd %xmm0, (%rdi)
  retq

Comment 1 Simon Pilgrim 2019-05-17 08:48:42 PDT

float3 cases would be useful as well: https://godbolt.org/z/DRs8Qf

#include <x86intrin.h>

void store_float3(__m128 f, float *p) {
  *p++ = f[0];
  *p++ = f[1];
  *p++ = f[2];
}

void store_float3_splat(__m128 f, float *p) {
  *p++ = f[0];
  *p++ = f[0];
  *p++ = f[0];
}

_Z12store_float3Dv4_fPf: # @_Z12store_float3Dv4_fPf
  vmovss %xmm0, (%rdi)
  vextractps $1, %xmm0, 4(%rdi)
  vextractps $2, %xmm0, 8(%rdi)
  retq
_Z18store_float3_splatDv4_fPf: # @_Z18store_float3_splatDv4_fPf
  vmovss %xmm0, (%rdi)
  vmovss %xmm0, 4(%rdi)
  vmovss %xmm0, 8(%rdi)
  retq

which could be:

_Z12store_float3Dv4_fPf: # @_Z12store_float3Dv4_fPf
  vmovsd %xmm0, (%rdi)
  vextractps $2, %xmm0, 8(%rdi)
  retq
_Z18store_float3_splatDv4_fPf: # @_Z18store_float3_splatDv4_fPf
  vpermilps $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0]
  vmovsd %xmm0, (%rdi)
  vmovss %xmm0, 8(%rdi)
  retq