define <2 x double> @load_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
; SSE42-LABEL: load_v2f64_v2i64:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    pxor %xmm2, %xmm2
; SSE42-NEXT:    pcmpeqq %xmm0, %xmm2
; SSE42-NEXT:    pextrb $0, %xmm2, %eax
; SSE42-NEXT:    testb $1, %al
; SSE42-NEXT:    je LBB1_2
; SSE42-NEXT:  ## %bb.1: ## %cond.load
; SSE42-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
; SSE42-NEXT:  LBB1_2: ## %else
; SSE42-NEXT:    pextrb $8, %xmm2, %eax
; SSE42-NEXT:    testb $1, %al
; SSE42-NEXT:    je LBB1_4
; SSE42-NEXT:  ## %bb.3: ## %cond.load1
; SSE42-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; SSE42-NEXT:  LBB1_4: ## %else2
; SSE42-NEXT:    movapd %xmm1, %xmm0
; SSE42-NEXT:    retq
  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
  ret <2 x double> %res
}
declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)

All of the pextrb instructions could be replaced with a single movmsk, giving something like:

  pxor %xmm2, %xmm2
  pcmpeqq %xmm0, %xmm2
  movmskpd %xmm2, %eax
  testb $1, %al
  je LBB1_2
## %bb.1: ## %cond.load
  movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
LBB1_2: ## %else
  testb $2, %al
  je LBB1_4
## %bb.3: ## %cond.load1
  movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
LBB1_4: ## %else2
  movapd %xmm1, %xmm0
  retq
Generalizing this bug, as it's not just masked memory ops that suffer:

https://gcc.godbolt.org/z/6qjhy4

define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
  %mask1 = fcmp oeq <4 x float> %x, %y
  %mask = extractelement <4 x i1> %mask1, i64 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}
declare float @llvm.floor.f32(float %s)

llc -mcpu=btver2

floor_mask_ss_mask8:                    # @floor_mask_ss_mask8
  movaps %xmm0, %xmm3
  cmpeqps %xmm1, %xmm3
  pextrb $0, %xmm3, %eax
  testb $1, %al
  je .LBB0_2
  xorps %xmm2, %xmm2
  roundss $9, %xmm0, %xmm2
.LBB0_2:
  blendps $1, %xmm2, %xmm1 # xmm1 = xmm2[0],xmm1[1,2,3]
  movaps %xmm1, %xmm0
  retq
Some simpler cases: https://godbolt.org/z/sy5EUA

define i32 @cmp1(<2 x double> %a0) {
  %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
  %b = extractelement <2 x i1> %a, i32 0
  %c = select i1 %b, i32 42, i32 99
  ret i32 %c
}

define i32 @cmp2(<2 x double> %a0) {
  %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
  %b = extractelement <2 x i1> %a, i32 0
  %c = extractelement <2 x i1> %a, i32 1
  %d = and i1 %b, %c
  %e = select i1 %d, i32 42, i32 99
  ret i32 %e
}
(In reply to Simon Pilgrim from comment #2)
> Some simpler cases: https://godbolt.org/z/sy5EUA
> 
> define i32 @cmp1(<2 x double> %a0) {
>   %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
>   %b = extractelement <2 x i1> %a, i32 0
>   %c = select i1 %b, i32 42, i32 99
>   ret i32 %c
> }

This might be a simple extension of:
https://reviews.llvm.org/D58282

https://godbolt.org/z/U3elbr
(In reply to Sanjay Patel from comment #3)
> (In reply to Simon Pilgrim from comment #2)
> > Some simpler cases: https://godbolt.org/z/sy5EUA
> > 
> > define i32 @cmp1(<2 x double> %a0) {
> >   %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
> >   %b = extractelement <2 x i1> %a, i32 0
> >   %c = select i1 %b, i32 42, i32 99
> >   ret i32 %c
> > }
> 
> This might be a simple extension of:
> https://reviews.llvm.org/D58282
> 
> https://godbolt.org/z/U3elbr

We do have that transform in IR already. Do we need it in codegen too?
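For reference, a hand-written sketch (not tool output, name is illustrative) of what that IR-level transform gives for cmp1 - the extract is hoisted ahead of the compare, so the fcmp becomes scalar:

define i32 @cmp1_scalarized(<2 x double> %a0) {
  %s = extractelement <2 x double> %a0, i32 0
  %b = fcmp ogt double %s, 1.000000e+00
  %c = select i1 %b, i32 42, i32 99
  ret i32 %c
}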
(In reply to Sanjay Patel from comment #4)
> (In reply to Sanjay Patel from comment #3)
> > (In reply to Simon Pilgrim from comment #2)
> > > Some simpler cases: https://godbolt.org/z/sy5EUA
> > > 
> > > define i32 @cmp1(<2 x double> %a0) {
> > >   %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
> > >   %b = extractelement <2 x i1> %a, i32 0
> > >   %c = select i1 %b, i32 42, i32 99
> > >   ret i32 %c
> > > }
> > 
> > This might be a simple extension of:
> > https://reviews.llvm.org/D58282
> > 
> > https://godbolt.org/z/U3elbr
> 
> We do have that transform in IR already. Do we need it in codegen too?

And the answer is 'yes' if I'm seeing it correctly. We can't hoist the
extract in IR in the more general case in comment 1 because both operands
are variables. That requires knowing that extract of element 0 of an FP
vector is free.
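For illustration, a hand-written sketch (not taken from the thread) of the hoisted form for the comment 1 example - only lane 0 of the compare is used, and a lane-0 FP extract costs nothing on x86, so this is the shape a codegen-level fold would effectively produce:

define <4 x float> @floor_mask_ss_mask8_hoisted(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
  %x0 = extractelement <4 x float> %x, i64 0
  %y0 = extractelement <4 x float> %y, i64 0
  %mask = fcmp oeq float %x0, %y0
  %call = tail call float @llvm.floor.f32(float %x0)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}
declare float @llvm.floor.f32(float)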
(In reply to Sanjay Patel from comment #5)
> And the answer is 'yes' if I'm seeing it correctly. We can't hoist the
> extract in IR in the more general case in comment 1 because both operands
> are variables. That requires knowing that extract of element 0 of an FP
> vector is free.

https://reviews.llvm.org/rL355741
We may be able to do some of this in SLP by recognising these as allof/anyof reductions of boolean results.
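As a rough sketch (hand-written, not SLP output) of what such an all-of reduction could look like in IR for the cmp2 case, using the llvm.vector.reduce.and intrinsic under its current name:

define i32 @cmp2_allof(<2 x double> %a0) {
  %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
  %d = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a)
  %e = select i1 %d, i32 42, i32 99
  ret i32 %e
}
declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1>)

The backend can then lower the reduction with a single movmskpd and compare rather than per-lane extracts.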
(In reply to Simon Pilgrim from comment #1)
> Generalizing this bug, as it's not just masked memory ops that suffer:
> 
> https://gcc.godbolt.org/z/6qjhy4
> 
> define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
>   %mask1 = fcmp oeq <4 x float> %x, %y
>   %mask = extractelement <4 x i1> %mask1, i64 0
>   %s = extractelement <4 x float> %x, i64 0
>   %call = tail call float @llvm.floor.f32(float %s)
>   %dst = extractelement <4 x float> %w, i64 0
>   %low = select i1 %mask, float %call, float %dst
>   %res = insertelement <4 x float> %y, float %low, i64 0
>   ret <4 x float> %res
> }
> declare float @llvm.floor.f32(float %s)
> 
> llc -mcpu=btver2
> 
> floor_mask_ss_mask8:                    # @floor_mask_ss_mask8
>   movaps %xmm0, %xmm3
>   cmpeqps %xmm1, %xmm3
>   pextrb $0, %xmm3, %eax
>   testb $1, %al
>   je .LBB0_2
>   xorps %xmm2, %xmm2
>   roundss $9, %xmm0, %xmm2
> .LBB0_2:
>   blendps $1, %xmm2, %xmm1 # xmm1 = xmm2[0],xmm1[1,2,3]
>   movaps %xmm1, %xmm0
>   retq

We're scalarizing now... so current codegen for an AVX target:

  vroundss $9, %xmm0, %xmm0, %xmm3
  vcmpeqss %xmm1, %xmm0, %xmm0
  vblendvps %xmm0, %xmm3, %xmm2, %xmm0
  vblendps $1, %xmm0, %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[1,2,3]
So the question for the comment 1 example is similar to bug 41145 - should we prefer blendv or branches?
And codegen for the example in the description is now:

  vpxor %xmm2, %xmm2, %xmm2
  vpcmpeqq %xmm2, %xmm0, %xmm0
  vmaskmovpd (%rdi), %xmm0, %xmm2
  vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
  retq

So I think it's the same question: bitwise select or branch?
(In reply to Sanjay Patel from comment #10)
> And codegen for the example in the description is now:
> 
>   vpxor %xmm2, %xmm2, %xmm2
>   vpcmpeqq %xmm2, %xmm0, %xmm0
>   vmaskmovpd (%rdi), %xmm0, %xmm2
>   vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
>   retq
> 
> So I think it's the same question: bitwise select or branch?

That's for AVX1 targets; the original example was for SSE42 - but v8i16/v16i8 tests on AVX1 have similar issues.
(In reply to Simon Pilgrim from comment #11) > (In reply to Sanjay Patel from comment #10) > > And codegen for the example in the description is now: > > vpxor %xmm2, %xmm2, %xmm2 > > vpcmpeqq %xmm2, %xmm0, %xmm0 > > vmaskmovpd (%rdi), %xmm0, %xmm2 > > vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 > > retq > > > > > > So I think it's the same question: bitwise select or branch? > > That's for AVX1 targets, the original example was for SSE42 - but > v8i16/v16i8 tests on AVX1 has similar issues. I suspect the answer in general is: "if it can be bitwise select, it probably should be"
(In reply to Roman Lebedev from comment #12)
> (In reply to Simon Pilgrim from comment #11)
> > (In reply to Sanjay Patel from comment #10)
> > > And codegen for the example in the description is now:
> > > 
> > >   vpxor %xmm2, %xmm2, %xmm2
> > >   vpcmpeqq %xmm2, %xmm0, %xmm0
> > >   vmaskmovpd (%rdi), %xmm0, %xmm2
> > >   vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
> > >   retq
> > > 
> > > So I think it's the same question: bitwise select or branch?
> > 
> > That's for AVX1 targets; the original example was for SSE42 - but
> > v8i16/v16i8 tests on AVX1 have similar issues.
> 
> I suspect the answer in general is:
> "if it can be bitwise select, it probably should be"

Maybe for loads - but for that to work we need to guarantee that the entire range is dereferenceable. I'm not sure if masked loads can/should currently fold to regular loads in that case?
For x86, I think we have to know/prove that the ends of the masked load are actual loads (i.e. the boundary mask bits are known set, so the whole range is dereferenceable) in order to convert the whole thing to a regular load.

http://lists.llvm.org/pipermail/llvm-dev/2016-April/097927.html
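To make that fold concrete, here is a hand-written sketch of the IR it would need, assuming the pointer is known dereferenceable for the full vector width (the attribute, alignment, and function name are illustrative, not from an existing test):

define <2 x double> @load_v2f64_v2i64_deref(<2 x i64> %trigger, <2 x double>* dereferenceable(16) %addr, <2 x double> %dst) {
  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
  %ld = load <2 x double>, <2 x double>* %addr, align 4
  %res = select <2 x i1> %mask, <2 x double> %ld, <2 x double> %dst
  ret <2 x double> %res
}

In that form the branches disappear entirely: the load is unconditional and the mask only feeds a vector select, which should lower to a blendv-style sequence rather than per-lane extracts and jumps.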