define <2 x double> @load_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
; SSE42-LABEL: load_v2f64_v2i64:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    pxor %xmm2, %xmm2
; SSE42-NEXT:    pcmpeqq %xmm0, %xmm2
; SSE42-NEXT:    pextrb $0, %xmm2, %eax
; SSE42-NEXT:    testb $1, %al
; SSE42-NEXT:    je LBB1_2
; SSE42-NEXT:  ## %bb.1: ## %cond.load
; SSE42-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
; SSE42-NEXT:  LBB1_2: ## %else
; SSE42-NEXT:    pextrb $8, %xmm2, %eax
; SSE42-NEXT:    testb $1, %al
; SSE42-NEXT:    je LBB1_4
; SSE42-NEXT:  ## %bb.3: ## %cond.load1
; SSE42-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; SSE42-NEXT:  LBB1_4: ## %else2
; SSE42-NEXT:    movapd %xmm1, %xmm0
; SSE42-NEXT:    retq
  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
  ret <2 x double> %res
}
declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)

All of the pextrb instructions could be replaced with a single movmsk, giving something like:

  pxor %xmm2, %xmm2
  pcmpeqq %xmm0, %xmm2
  movmskpd %xmm2, %eax
  testb $1, %al
  je LBB1_2
## %bb.1: ## %cond.load
  movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
LBB1_2: ## %else
  testb $2, %al
  je LBB1_4
## %bb.3: ## %cond.load1
  movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
LBB1_4: ## %else2
  movapd %xmm1, %xmm0
  retq
Generalizing this bug, as it's not just masked memory ops that suffer:

https://gcc.godbolt.org/z/6qjhy4

define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
  %mask1 = fcmp oeq <4 x float> %x, %y
  %mask = extractelement <4 x i1> %mask1, i64 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}
declare float @llvm.floor.f32(float %s)

llc -mcpu=btver2

floor_mask_ss_mask8:                    # @floor_mask_ss_mask8
  movaps %xmm0, %xmm3
  cmpeqps %xmm1, %xmm3
  pextrb $0, %xmm3, %eax
  testb $1, %al
  je .LBB0_2
  xorps %xmm2, %xmm2
  roundss $9, %xmm0, %xmm2
.LBB0_2:
  blendps $1, %xmm2, %xmm1 # xmm1 = xmm2[0],xmm1[1,2,3]
  movaps %xmm1, %xmm0
  retq
Some simpler cases: https://godbolt.org/z/sy5EUA

define i32 @cmp1(<2 x double> %a0) {
  %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
  %b = extractelement <2 x i1> %a, i32 0
  %c = select i1 %b, i32 42, i32 99
  ret i32 %c
}

define i32 @cmp2(<2 x double> %a0) {
  %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
  %b = extractelement <2 x i1> %a, i32 0
  %c = extractelement <2 x i1> %a, i32 1
  %d = and i1 %b, %c
  %e = select i1 %d, i32 42, i32 99
  ret i32 %e
}
(In reply to Simon Pilgrim from comment #2)
> Some simpler cases: https://godbolt.org/z/sy5EUA
> 
> define i32 @cmp1(<2 x double> %a0) {
>   %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
>   %b = extractelement <2 x i1> %a, i32 0
>   %c = select i1 %b, i32 42, i32 99
>   ret i32 %c
> }

This might be a simple extension of:
https://reviews.llvm.org/D58282

https://godbolt.org/z/U3elbr
(In reply to Sanjay Patel from comment #3)
> (In reply to Simon Pilgrim from comment #2)
> > Some simpler cases: https://godbolt.org/z/sy5EUA
> > 
> > define i32 @cmp1(<2 x double> %a0) {
> >   %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
> >   %b = extractelement <2 x i1> %a, i32 0
> >   %c = select i1 %b, i32 42, i32 99
> >   ret i32 %c
> > }
> 
> This might be a simple extension of:
> https://reviews.llvm.org/D58282
> 
> https://godbolt.org/z/U3elbr

We do have that transform in IR already. Do we need it in codegen too?
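For reference, a hand-written sketch (not tool output, name is illustrative) of what that IR-level transform gives for cmp1 - the extract is hoisted ahead of the compare, so the fcmp becomes scalar:

define i32 @cmp1_scalarized(<2 x double> %a0) {
  %s = extractelement <2 x double> %a0, i32 0
  %b = fcmp ogt double %s, 1.000000e+00
  %c = select i1 %b, i32 42, i32 99
  ret i32 %c
}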
(In reply to Sanjay Patel from comment #4)
> (In reply to Sanjay Patel from comment #3)
> > (In reply to Simon Pilgrim from comment #2)
> > > Some simpler cases: https://godbolt.org/z/sy5EUA
> > > 
> > > define i32 @cmp1(<2 x double> %a0) {
> > >   %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
> > >   %b = extractelement <2 x i1> %a, i32 0
> > >   %c = select i1 %b, i32 42, i32 99
> > >   ret i32 %c
> > > }
> > 
> > This might be a simple extension of:
> > https://reviews.llvm.org/D58282
> > 
> > https://godbolt.org/z/U3elbr
> 
> We do have that transform in IR already. Do we need it in codegen too?

And the answer is 'yes' if I'm seeing it correctly. We can't hoist the
extract in IR in the more general case in comment 1 because both operands
are variables. That requires knowing that extract of element 0 of an FP
vector is free.
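For illustration, a hand-written sketch (not taken from the thread) of the hoisted form for the comment 1 example - only lane 0 of the compare is used, and a lane-0 FP extract costs nothing on x86, so this is the shape a codegen-level fold would effectively produce:

define <4 x float> @floor_mask_ss_mask8_hoisted(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
  %x0 = extractelement <4 x float> %x, i64 0
  %y0 = extractelement <4 x float> %y, i64 0
  %mask = fcmp oeq float %x0, %y0
  %call = tail call float @llvm.floor.f32(float %x0)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}
declare float @llvm.floor.f32(float)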
(In reply to Sanjay Patel from comment #5)
> And the answer is 'yes' if I'm seeing it correctly. We can't hoist the
> extract in IR in the more general case in comment 1 because both operands
> are variables. That requires knowing that extract of element 0 of an FP
> vector is free.

https://reviews.llvm.org/rL355741
We may be able to do some of this in SLP by recognising these as allof/anyof reductions of boolean results.
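As a rough sketch (hand-written, not SLP output) of what such an all-of reduction could look like in IR for the cmp2 case, using the llvm.vector.reduce.and intrinsic under its current name:

define i32 @cmp2_allof(<2 x double> %a0) {
  %a = fcmp ogt <2 x double> %a0, <double 1.000000e+00, double 1.000000e+00>
  %d = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a)
  %e = select i1 %d, i32 42, i32 99
  ret i32 %e
}
declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1>)

The backend can then lower the reduction with a single movmskpd and compare rather than per-lane extracts.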
(In reply to Simon Pilgrim from comment #1)
> Generalizing this bug, as it's not just masked memory ops that suffer:
> 
> https://gcc.godbolt.org/z/6qjhy4
> 
> define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
>   %mask1 = fcmp oeq <4 x float> %x, %y
>   %mask = extractelement <4 x i1> %mask1, i64 0
>   %s = extractelement <4 x float> %x, i64 0
>   %call = tail call float @llvm.floor.f32(float %s)
>   %dst = extractelement <4 x float> %w, i64 0
>   %low = select i1 %mask, float %call, float %dst
>   %res = insertelement <4 x float> %y, float %low, i64 0
>   ret <4 x float> %res
> }
> declare float @llvm.floor.f32(float %s)
> 
> llc -mcpu=btver2
> 
> floor_mask_ss_mask8:                    # @floor_mask_ss_mask8
>   movaps %xmm0, %xmm3
>   cmpeqps %xmm1, %xmm3
>   pextrb $0, %xmm3, %eax
>   testb $1, %al
>   je .LBB0_2
>   xorps %xmm2, %xmm2
>   roundss $9, %xmm0, %xmm2
> .LBB0_2:
>   blendps $1, %xmm2, %xmm1 # xmm1 = xmm2[0],xmm1[1,2,3]
>   movaps %xmm1, %xmm0
>   retq

We're scalarizing now... so current codegen for an AVX target:

  vroundss $9, %xmm0, %xmm0, %xmm3
  vcmpeqss %xmm1, %xmm0, %xmm0
  vblendvps %xmm0, %xmm3, %xmm2, %xmm0
  vblendps $1, %xmm0, %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[1,2,3]
So the question for the comment 1 example is similar to bug 41145 - should we prefer blendv or branches?
And codegen for the example in the description is now:

  vpxor %xmm2, %xmm2, %xmm2
  vpcmpeqq %xmm2, %xmm0, %xmm0
  vmaskmovpd (%rdi), %xmm0, %xmm2
  vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
  retq

So I think it's the same question: bitwise select or branch?
(In reply to Sanjay Patel from comment #10)
> And codegen for the example in the description is now:
> 
>   vpxor %xmm2, %xmm2, %xmm2
>   vpcmpeqq %xmm2, %xmm0, %xmm0
>   vmaskmovpd (%rdi), %xmm0, %xmm2
>   vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
>   retq
> 
> So I think it's the same question: bitwise select or branch?

That's for AVX1 targets; the original example was for SSE42 - but v8i16/v16i8 tests on AVX1 have similar issues.
(In reply to Simon Pilgrim from comment #11) > (In reply to Sanjay Patel from comment #10) > > And codegen for the example in the description is now: > > vpxor %xmm2, %xmm2, %xmm2 > > vpcmpeqq %xmm2, %xmm0, %xmm0 > > vmaskmovpd (%rdi), %xmm0, %xmm2 > > vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 > > retq > > > > > > So I think it's the same question: bitwise select or branch? > > That's for AVX1 targets, the original example was for SSE42 - but > v8i16/v16i8 tests on AVX1 has similar issues. I suspect the answer in general is: "if it can be bitwise select, it probably should be"
(In reply to Roman Lebedev from comment #12)
> (In reply to Simon Pilgrim from comment #11)
> > (In reply to Sanjay Patel from comment #10)
> > > And codegen for the example in the description is now:
> > > 
> > >   vpxor %xmm2, %xmm2, %xmm2
> > >   vpcmpeqq %xmm2, %xmm0, %xmm0
> > >   vmaskmovpd (%rdi), %xmm0, %xmm2
> > >   vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
> > >   retq
> > > 
> > > So I think it's the same question: bitwise select or branch?
> > 
> > That's for AVX1 targets; the original example was for SSE42 - but
> > v8i16/v16i8 tests on AVX1 have similar issues.
> 
> I suspect the answer in general is:
> "if it can be bitwise select, it probably should be"

Maybe for loads - but for that to work we need to guarantee that the entire range is dereferenceable. I'm not sure if masked loads can/should currently fold to regular loads in that case?
For x86, I think we have to know/prove that the ends of the masked load are actual loads (i.e. the boundary mask bits are known set, so the whole range is dereferenceable) in order to convert the whole thing to a regular load.

http://lists.llvm.org/pipermail/llvm-dev/2016-April/097927.html
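To make that fold concrete, here is a hand-written sketch of the IR it would need, assuming the pointer is known dereferenceable for the full vector width (the attribute, alignment, and function name are illustrative, not from an existing test):

define <2 x double> @load_v2f64_v2i64_deref(<2 x i64> %trigger, <2 x double>* dereferenceable(16) %addr, <2 x double> %dst) {
  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
  %ld = load <2 x double>, <2 x double>* %addr, align 4
  %res = select <2 x i1> %mask, <2 x double> %ld, <2 x double> %dst
  ret <2 x double> %res
}

In that form the branches disappear entirely: the load is unconditional and the mask only feeds a vector select, which should lower to a blendv-style sequence rather than per-lane extracts and jumps.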