
[X86][SSE] Comparison result extractions should use MOVMSK to simplify mask handling #39013

Closed
RKSimon opened this issue Nov 14, 2018 · 15 comments
Labels: backend:X86, bugzilla (Issues migrated from bugzilla)

Comments

@RKSimon
Collaborator

RKSimon commented Nov 14, 2018

Bugzilla Link 39665
Version trunk
OS Windows NT
CC @topperc, @dtemirbulatov, @LebedevRI, @RKSimon, @rotateright
## Extended Description 
define <2 x double> @load_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
; SSE42-LABEL: load_v2f64_v2i64:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    pxor %xmm2, %xmm2
; SSE42-NEXT:    pcmpeqq %xmm0, %xmm2
; SSE42-NEXT:    pextrb $0, %xmm2, %eax
; SSE42-NEXT:    testb $1, %al
; SSE42-NEXT:    je LBB1_2
; SSE42-NEXT:  ## %bb.1: ## %cond.load
; SSE42-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
; SSE42-NEXT:  LBB1_2: ## %else
; SSE42-NEXT:    pextrb $8, %xmm2, %eax
; SSE42-NEXT:    testb $1, %al
; SSE42-NEXT:    je LBB1_4
; SSE42-NEXT:  ## %bb.3: ## %cond.load1
; SSE42-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; SSE42-NEXT:  LBB1_4: ## %else2
; SSE42-NEXT:    movapd %xmm1, %xmm0
; SSE42-NEXT:    retq
  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
  ret <2 x double> %res
}
declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)

All of the pextrb extractions can be replaced with a single movmsk, giving something like:

  pxor %xmm2, %xmm2
  pcmpeqq %xmm0, %xmm2
  movmskpd %xmm2, %eax
  testb $1, %al
  je LBB1_2
## %bb.1: ## %cond.load
  movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
LBB1_2: ## %else
  testb $2, %al
  je LBB1_4
## %bb.3: ## %cond.load1
  movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
LBB1_4: ## %else2
  movapd %xmm1, %xmm0
  retq
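
For reference, here is a rough C intrinsics sketch of the same idea (not the actual codegen change; the helper name is made up for illustration): extract the whole compare mask once with movmskpd and test individual bits, instead of doing one pextrb + testb per lane.

```c
#include <smmintrin.h>  /* SSE4.1/4.2 intrinsics (pcmpeqq) */

/* Hypothetical helper mirroring @load_v2f64_v2i64: replace lanes of dst
   with loads from addr where the corresponding lane of trigger is zero. */
static __m128d masked_load_v2f64(__m128i trigger, const double *addr, __m128d dst)
{
    /* pcmpeqq + movmskpd: one scalar mask instead of two pextrb extractions. */
    __m128i cmp = _mm_cmpeq_epi64(trigger, _mm_setzero_si128());
    int mask = _mm_movemask_pd(_mm_castsi128_pd(cmp));

    if (mask & 1)
        dst = _mm_loadl_pd(dst, addr);      /* movlpd: replace low lane  */
    if (mask & 2)
        dst = _mm_loadh_pd(dst, addr + 1);  /* movhpd: replace high lane */
    return dst;
}
```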
@RKSimon
Collaborator Author

RKSimon commented Mar 8, 2019

Generalizing this bug, as it's not just masked memory ops that suffer:

https://gcc.godbolt.org/z/6qjhy4

define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
  %mask1 = fcmp oeq <4 x float> %x, %y
  %mask = extractelement <4 x i1> %mask1, i64 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}
declare float @llvm.floor.f32(float %s)

llc -mcpu=btver2

floor_mask_ss_mask8:
  movaps %xmm0, %xmm3
  cmpeqps %xmm1, %xmm3
  pextrb $0, %xmm3, %eax
  testb $1, %al
  je .LBB0_2
  xorps %xmm2, %xmm2
  roundss $9, %xmm0, %xmm2
.LBB0_2:
  blendps $1, %xmm2, %xmm1 # xmm1 = xmm2[0],xmm1[1,2,3]
  movaps %xmm1, %xmm0
  retq

@RKSimon
Collaborator Author

RKSimon commented Mar 8, 2019

Some simpler cases: https://godbolt.org/z/sy5EUA

define i32 @cmp1(<2 x double> %a0) {
  %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
  %b = extractelement <2 x i1> %a, i32 0
  %c = select i1 %b, i32 42, i32 99
  ret i32 %c
}

define i32 @cmp2(<2 x double> %a0) {
  %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
  %b = extractelement <2 x i1> %a, i32 0
  %c = extractelement <2 x i1> %a, i32 1
  %d = and i1 %b, %c
  %e = select i1 %d, i32 42, i32 99
  ret i32 %e
}
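
A hedged C intrinsics sketch of what these should boil down to (function names reuse the IR names purely for illustration): one compare, one movmskpd, then a scalar bit test - extracting element 0 is `mask & 1`, and the and-of-both-lanes in cmp2 is `mask == 3`.

```c
#include <emmintrin.h>  /* SSE2 intrinsics */

int cmp1(__m128d a0)
{
    __m128d a = _mm_cmpgt_pd(a0, _mm_set1_pd(1.0)); /* fcmp ogt -> all-ones/zero lanes */
    int mask = _mm_movemask_pd(a);                  /* sign bits -> 2-bit scalar mask  */
    return (mask & 1) ? 42 : 99;                    /* extractelement 0                */
}

int cmp2(__m128d a0)
{
    __m128d a = _mm_cmpgt_pd(a0, _mm_set1_pd(1.0));
    int mask = _mm_movemask_pd(a);
    return (mask == 3) ? 42 : 99;                   /* and of both lanes (all-of)      */
}
```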

@rotateright
Contributor

rotateright commented Mar 8, 2019

> Some simpler cases: https://godbolt.org/z/sy5EUA
>
> define i32 @cmp1(<2 x double> %a0) {
>   %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
>   %b = extractelement <2 x i1> %a, i32 0
>   %c = select i1 %b, i32 42, i32 99
>   ret i32 %c
> }

This might be a simple extension of:
https://reviews.llvm.org/D58282

https://godbolt.org/z/U3elbr

@rotateright
Contributor

> Some simpler cases: https://godbolt.org/z/sy5EUA
>
> define i32 @cmp1(<2 x double> %a0) {
>   %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
>   %b = extractelement <2 x i1> %a, i32 0
>   %c = select i1 %b, i32 42, i32 99
>   ret i32 %c
> }
>
> This might be a simple extension of:
> https://reviews.llvm.org/D58282
>
> https://godbolt.org/z/U3elbr

We do have that transform in IR already. Do we need it in codegen too?

@rotateright
Contributor

> Some simpler cases: https://godbolt.org/z/sy5EUA
>
> define i32 @cmp1(<2 x double> %a0) {
>   %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
>   %b = extractelement <2 x i1> %a, i32 0
>   %c = select i1 %b, i32 42, i32 99
>   ret i32 %c
> }
>
> This might be a simple extension of:
> https://reviews.llvm.org/D58282
>
> https://godbolt.org/z/U3elbr
>
> We do have that transform in IR already. Do we need it in codegen too?

And the answer is 'yes' if I'm seeing it correctly. We can't hoist the extract in IR in the more general case in comment 1 because both operands are variables. That requires knowing that extract of element 0 of an FP vector is free.

@rotateright
Contributor

> And the answer is 'yes' if I'm seeing it correctly. We can't hoist the extract in IR in the more general case in comment 1 because both operands are variables. That requires knowing that extract of element 0 of an FP vector is free.

https://reviews.llvm.org/rL355741

@RKSimon
Collaborator Author

RKSimon commented Mar 18, 2019

We may be able to do some of this in SLP by recognising these as allof/anyof reductions of boolean results.
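
For illustration (my example, not SLP output), this is the kind of scalar pattern being referred to, alongside the compare + movmsk shape it could be turned into:

```c
#include <emmintrin.h>  /* SSE2 intrinsics */

/* Scalar all-of reduction of boolean results that SLP could recognise... */
int all_gt_one_scalar(const double x[2])
{
    return (x[0] > 1.0) && (x[1] > 1.0);
}

/* ...and the vectorised equivalent: one compare plus one movmskpd. */
int all_gt_one_vector(const double x[2])
{
    __m128d c = _mm_cmpgt_pd(_mm_loadu_pd(x), _mm_set1_pd(1.0));
    return _mm_movemask_pd(c) == 3;  /* both sign bits set => all lanes true */
}
```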

@rotateright
Contributor

> Generalizing this bug, as it's not just masked memory ops that suffer:
>
> https://gcc.godbolt.org/z/6qjhy4
>
> define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
>   %mask1 = fcmp oeq <4 x float> %x, %y
>   %mask = extractelement <4 x i1> %mask1, i64 0
>   %s = extractelement <4 x float> %x, i64 0
>   %call = tail call float @llvm.floor.f32(float %s)
>   %dst = extractelement <4 x float> %w, i64 0
>   %low = select i1 %mask, float %call, float %dst
>   %res = insertelement <4 x float> %y, float %low, i64 0
>   ret <4 x float> %res
> }
> declare float @llvm.floor.f32(float %s)
>
> llc -mcpu=btver2
>
> floor_mask_ss_mask8: # @floor_mask_ss_mask8
>   movaps %xmm0, %xmm3
>   cmpeqps %xmm1, %xmm3
>   pextrb $0, %xmm3, %eax
>   testb $1, %al
>   je .LBB0_2
>   xorps %xmm2, %xmm2
>   roundss $9, %xmm0, %xmm2
> .LBB0_2:
>   blendps $1, %xmm2, %xmm1 # xmm1 = xmm2[0],xmm1[1,2,3]
>   movaps %xmm1, %xmm0
>   retq

We're scalarizing now...so current codegen for an AVX target:
vroundss $9, %xmm0, %xmm0, %xmm3
vcmpeqss %xmm1, %xmm0, %xmm0
vblendvps %xmm0, %xmm3, %xmm2, %xmm0
vblendps $1, %xmm0, %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[1,2,3]
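
That branchless shape looks roughly like the following hedged C intrinsics sketch of the comment 1 example (the rounding immediate 9 corresponds to floor with exceptions suppressed; the compare here is the full-width cmpeqps from the IR rather than the scalar vcmpeqss above):

```c
#include <smmintrin.h>  /* SSE4.1: roundss, blendvps, blendps */

/* Hedged sketch of floor_mask_ss_mask8 using blendv instead of a branch. */
__m128 floor_mask_ss_mask8(__m128 x, __m128 y, __m128 w)
{
    __m128 mask = _mm_cmpeq_ps(x, y);                          /* cmpeqps lane mask */
    __m128 f = _mm_round_ss(x, x,
                   _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); /* roundss $9        */
    __m128 low = _mm_blendv_ps(w, f, mask);                    /* floor(x0) or w0   */
    return _mm_blend_ps(y, low, 1);                            /* insert into y[0]  */
}
```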

@rotateright
Contributor

So the question for the comment 1 example is similar to bug 41145 - should we prefer blendv or branches?

@rotateright
Contributor

And codegen for the example in the description is now:
vpxor %xmm2, %xmm2, %xmm2
vpcmpeqq %xmm2, %xmm0, %xmm0
vmaskmovpd (%rdi), %xmm0, %xmm2
vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
retq

So I think it's the same question: bitwise select or branch?

@RKSimon
Collaborator Author

RKSimon commented Mar 20, 2019

> And codegen for the example in the description is now:
> vpxor %xmm2, %xmm2, %xmm2
> vpcmpeqq %xmm2, %xmm0, %xmm0
> vmaskmovpd (%rdi), %xmm0, %xmm2
> vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
> retq
>
> So I think it's the same question: bitwise select or branch?

That's for AVX1 targets; the original example was for SSE42 - but v8i16/v16i8 tests on AVX1 have similar issues.

@LebedevRI
Member

> And codegen for the example in the description is now:
> vpxor %xmm2, %xmm2, %xmm2
> vpcmpeqq %xmm2, %xmm0, %xmm0
> vmaskmovpd (%rdi), %xmm0, %xmm2
> vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
> retq
>
> So I think it's the same question: bitwise select or branch?
>
> That's for AVX1 targets; the original example was for SSE42 - but v8i16/v16i8 tests on AVX1 have similar issues.

I suspect the answer in general is:
"if it can be bitwise select, it probably should be"

@RKSimon
Collaborator Author

RKSimon commented Mar 21, 2019

> And codegen for the example in the description is now:
> vpxor %xmm2, %xmm2, %xmm2
> vpcmpeqq %xmm2, %xmm0, %xmm0
> vmaskmovpd (%rdi), %xmm0, %xmm2
> vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
> retq
>
> So I think it's the same question: bitwise select or branch?
>
> That's for AVX1 targets; the original example was for SSE42 - but v8i16/v16i8 tests on AVX1 have similar issues.
>
> I suspect the answer in general is:
> "if it can be bitwise select, it probably should be"

Maybe for loads - but for that to work we need to guarantee that the entire range is dereferenceable - I'm not sure if masked loads can/should currently fold to regular loads in that case?

@rotateright
Contributor

For x86, I think we have to know/prove that the ends of the masked load are actual loads in order to convert the whole thing to a regular load.

http://lists.llvm.org/pipermail/llvm-dev/2016-April/097927.html
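
A hedged C illustration of the concern (the function and buffer layout are mine): a masked load with a disabled lane may legitimately sit right at the end of an allocation, so folding it to a plain 16-byte load would touch memory the original never accessed.

```c
#include <stddef.h>
#include <immintrin.h>  /* AVX: _mm_maskload_pd */

/* Load only buf[n-1] into the low lane; the high lane is masked off. */
__m128d load_last_element(const double *buf, size_t n /* n >= 1 */)
{
    __m128i mask = _mm_set_epi64x(0, -1);      /* low lane enabled, high disabled */
    /* Safe: the disabled high lane (buf[n]) is never dereferenced.               */
    return _mm_maskload_pd(buf + n - 1, mask);
    /* Not equivalent in general: _mm_loadu_pd(buf + n - 1) would also read
       buf[n], which may be past the end of the allocation and could fault.       */
}
```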

@llvmbot llvmbot transferred this issue from llvm/llvm-bugzilla-archive Dec 10, 2021
@RKSimon
Collaborator Author

RKSimon commented Apr 6, 2022

Resolving - we do a good job of using movmsk now

@RKSimon RKSimon closed this as completed Apr 6, 2022