Current Codegen: https://godbolt.org/z/n0UB_k Pulled out of the c-ray benchmark: #define ERR_MARGIN 1e-6 bool cmp_lt_gt(double a, double b, double c) { double t1 = (-b + c) / (2.0 * a); double t2 = (-b - c) / (2.0 * a); if((t1 < ERR_MARGIN && t2 < ERR_MARGIN) || (t1 > 1.0 && t2 > 1.0)) return 0; return 1; } SLP fails to create AND reductions for either of these, let alone merge them into a single (tweaked) reduction (and branch). Oddly it also manages to vectorize only one of the comparisons but then fails to form a reduction for the result. clang -g0 -O3 -march=btver2 -emit-llvm define i1 @_Z9cmp_lt_gtddd(double %0, double %1, double %2) { %4 = fneg double %1 %5 = fmul double %0, 2.000000e+00 %6 = insertelement <2 x double> undef, double %4, i32 0 %7 = insertelement <2 x double> %6, double %2, i32 1 %8 = insertelement <2 x double> undef, double %2, i32 0 %9 = insertelement <2 x double> %8, double %1, i32 1 %10 = fsub <2 x double> %7, %9 %11 = insertelement <2 x double> undef, double %5, i32 0 %12 = shufflevector <2 x double> %11, <2 x double> undef, <2 x i32> zeroinitializer %13 = fdiv <2 x double> %10, %12 %14 = extractelement <2 x double> %13, i32 1 %15 = fcmp olt double %14, 0x3EB0C6F7A0B5ED8D %16 = extractelement <2 x double> %13, i32 0 %17 = fcmp olt double %16, 0x3EB0C6F7A0B5ED8D %18 = and i1 %15, %17 br i1 %18, label %24, label %19 19: ; preds = %3 %20 = fcmp ule <2 x double> %13, <double 1.000000e+00, double 1.000000e+00> %21 = extractelement <2 x i1> %20, i32 0 %22 = extractelement <2 x i1> %20, i32 1 %23 = or i1 %21, %22 ret i1 %23 24: ; preds = %3 ret i1 false }
HorizontalReduction::tryToReduce bails out on <2 x X> reduction cases, which seems to prevent the fcmp vectorization as well.
(In reply to Simon Pilgrim from comment #1) > HorizontalReduction.tryToReduce bails out of <2 x X> reduction cases, which > seems to prevent the fcmp vectorization as well. Would this handle it? https://reviews.llvm.org/D59710
(In reply to Sanjay Patel from comment #2) > (In reply to Simon Pilgrim from comment #1) > > HorizontalReduction.tryToReduce bails out of <2 x X> reduction cases, which > > seems to prevent the fcmp vectorization as well. > > Would this handle it? > https://reviews.llvm.org/D59710 Yes, I think so. Even if we don't want to introduce <2 x X> reductions in the middle end for whatever reason, we would still catch this in the backend as long as the fcmp itself had been vectorized.
https://godbolt.org/z/EKQcUy This contains an ideal version that merges the reductions into a single result: bool cmp_lt_gt_sse(double a, double b, double c) { double t1 = (-b + c) / (2.0 * a); double t2 = (-b - c) / (2.0 * a); __m128d t12 = _mm_setr_pd(t1, t2); __m128d lo = _mm_cmplt_pd(t12, _mm_set1_pd(ERR_MARGIN)); __m128d hi = _mm_cmpgt_pd(t12, _mm_set1_pd(1.0)); __m128d reduce = _mm_and_pd( _mm_unpacklo_pd(lo, hi), _mm_unpackhi_pd(lo, hi) ); if (_mm_movemask_pd(reduce) != 0) return 0; return 1; }
D59710 became a restricted first step to avoid regressions on other cases: https://reviews.llvm.org/rG7ff57705ba196ce649d6034614b3b9df57e1f84f
VectorCombine proposal that would create the missing reduction pattern (as a shuffle+logic-op pattern, though, not a reduction intrinsic): https://reviews.llvm.org/D82474 I'm not sure how we get from there to the ideal code from comment 4. We could start by converting reduction patterns to intrinsics in VectorCombine.
(In reply to Sanjay Patel from comment #6) > I'm not sure how we get from there to the ideal code from comment 4. > We could start by converting reduction patterns to intrinsics in > VectorCombine. Handling in the backend should be relatively trivial with the recent work on CMP(MOVMSK()) folds.
(In reply to Simon Pilgrim from comment #7) > (In reply to Sanjay Patel from comment #6) > > I'm not sure how we get from there to the ideal code from comment 4. > > We could start by converting reduction patterns to intrinsics in > > VectorCombine. > > Handling in the backend should be relatively trivial with the recent work on > CMP(MOVMSK()) folds. After https://reviews.llvm.org/rGb6315aee5b42, we have this IR: define i1 @_Z9cmp_lt_gtddd(double %a, double %b, double %c) { entry: %fneg = fneg double %b %mul = fmul double %a, 2.000000e+00 %0 = insertelement <2 x double> undef, double %c, i32 0 %1 = insertelement <2 x double> %0, double %fneg, i32 1 %2 = insertelement <2 x double> undef, double %b, i32 0 %3 = insertelement <2 x double> %2, double %c, i32 1 %4 = fsub <2 x double> %1, %3 %5 = insertelement <2 x double> undef, double %mul, i32 0 %6 = shufflevector <2 x double> %5, <2 x double> undef, <2 x i32> zeroinitializer %7 = fdiv <2 x double> %4, %6 %8 = fcmp olt <2 x double> %7, <double 0x3EB0C6F7A0B5ED8D, double 0x3EB0C6F7A0B5ED8D> %shift = shufflevector <2 x i1> %8, <2 x i1> undef, <2 x i32> <i32 1, i32 undef> %9 = and <2 x i1> %8, %shift %or.cond = extractelement <2 x i1> %9, i64 0 br i1 %or.cond, label %cleanup, label %lor.lhs.false lor.lhs.false: ; preds = %entry %10 = fcmp ule <2 x double> %7, <double 1.000000e+00, double 1.000000e+00> %shift17 = shufflevector <2 x i1> %10, <2 x i1> undef, <2 x i32> <i32 1, i32 undef> %11 = or <2 x i1> %10, %shift17 %not.or.cond9 = extractelement <2 x i1> %11, i32 0 ret i1 %not.or.cond9 cleanup: ; preds = %entry ret i1 false } ------------------------------------------------------------------- SDAG can't do anything with the ops in different basic blocks. This would require SimplifyCFG to speculatively execute the 2nd reduction plus some instcombine or other transforms to convert to a single reduction? 
Current x86 AVX asm looks like this: LCPI0_0: .quad 0x8000000000000000 ## double -0 .quad 0x8000000000000000 ## double -0 LCPI0_1: .quad 0x3eb0c6f7a0b5ed8d ## double 9.9999999999999995E-7 .quad 0x3eb0c6f7a0b5ed8d ## double 9.9999999999999995E-7 LCPI0_2: .quad 0x3ff0000000000000 ## double 1 .quad 0x3ff0000000000000 ## double 1 .section __TEXT,__text,regular,pure_instructions .globl __Z9cmp_lt_gtddd .p2align 4, 0x90 __Z9cmp_lt_gtddd: ## @_Z9cmp_lt_gtddd .cfi_startproc ## %bb.0: ## %entry vxorpd LCPI0_0(%rip), %xmm1, %xmm3 vaddsd %xmm0, %xmm0, %xmm0 vunpcklpd %xmm3, %xmm2, %xmm3 ## xmm3 = xmm2[0],xmm3[0] vunpcklpd %xmm2, %xmm1, %xmm1 ## xmm1 = xmm1[0],xmm2[0] vsubpd %xmm1, %xmm3, %xmm1 vmovddup %xmm0, %xmm0 ## xmm0 = xmm0[0,0] vdivpd %xmm0, %xmm1, %xmm0 vcmpltpd LCPI0_1(%rip), %xmm0, %xmm1 vmovmskpd %xmm1, %eax cmpb $3, %al jne LBB0_1 ## %bb.2: ## %cleanup xorl %eax, %eax retq LBB0_1: ## %lor.lhs.false vmovapd LCPI0_2(%rip), %xmm1 ## xmm1 = [1.0E+0,1.0E+0] vcmpnltpd %xmm0, %xmm1, %xmm0 vmovmskpd %xmm0, %eax testb %al, %al setne %al retq
In c-ray 1.1, gcc 11 is 94% faster than clang 12. https://www.phoronix.com/scan.php?page=article&item=clang12-gcc11-icelake&num=4
Current IR: define i1 @_Z9cmp_lt_gtddd(double %0, double %1, double %2) { %4 = fneg double %1 %5 = fsub double %2, %1 %6 = fmul double %0, 2.000000e+00 %7 = fdiv double %5, %6 %8 = fsub double %4, %2 %9 = fdiv double %8, %6 %10 = fcmp olt double %7, 0x3EB0C6F7A0B5ED8D %11 = fcmp olt double %9, 0x3EB0C6F7A0B5ED8D %12 = select i1 %10, i1 %11, i1 false br i1 %12, label %17, label %13 13: %14 = fcmp ule double %7, 1.000000e+00 %15 = fcmp ule double %9, 1.000000e+00 %16 = select i1 %14, i1 true, i1 %15 ret i1 %16 17: ret i1 false }