LLVM Bugzilla is read-only and represents the historical archive of all LLVM issues filed before November 26, 2021. Use GitHub to submit LLVM bugs.

Bug 33013 - [x86] default codegen should be more branchy
Summary: [x86] default codegen should be more branchy
Status: NEW
Alias: None
Product: libraries
Classification: Unclassified
Component: Backend: X86
Version: trunk
Hardware: PC All
Importance: P enhancement
Assignee: Unassigned LLVM Bugs
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2017-05-11 16:27 PDT by Sanjay Patel
Modified: 2017-09-05 06:00 PDT
CC List: 9 users

See Also:
Fixed By Commit(s):


Description Sanjay Patel 2017-05-11 16:27:32 PDT
The default x86 target is something like an Intel big core (i.e., it's good at absorbing the cost of correctly predicted branches, good at predicting those branches, and good at speculating execution past those branches).

Therefore, we shouldn't favor cmov codegen for IR select as much as we currently do. Example:

int foo(float x) {
  if (x < 42.0f)
    return x;
  return 12;
}

define i32 @foo(float %x) {
  %cmp = fcmp olt float %x, 4.200000e+01
  %conv = fptosi float %x to i32
  %ret = select i1 %cmp, i32 %conv, i32 12
  ret i32 %ret
}

$ clang -O2 cmovfp.c -S -o -
    movss    LCPI0_0(%rip), %xmm1    ## xmm1 = mem[0],zero,zero,zero
    ucomiss    %xmm0, %xmm1
    cvttss2si    %xmm0, %ecx
    movl    $12, %eax
    cmoval    %ecx, %eax
    retq

Note that gcc and icc will use compare and branch on this example.
Comment 1 Sanjay Patel 2017-05-11 17:25:51 PDT
An example from https://groups.google.com/forum/#!topic/llvm-dev/Tl9SD-edUJI where the select is not explicit in IR:

define i64 @cvtf(float %f) {
  %cvt = fptoui float %f to i64
  ret i64 %cvt
}

	movss	LCPI0_0(%rip), %xmm1    ## xmm1 = mem[0],zero,zero,zero
	movaps	%xmm0, %xmm2
	subss	%xmm1, %xmm2
	cvttss2si	%xmm2, %rax
	movabsq	$-9223372036854775808, %rcx ## imm = 0x8000000000000000
	xorq	%rax, %rcx
	cvttss2si	%xmm0, %rax
	ucomiss	%xmm1, %xmm0
	cmovaeq	%rcx, %rax
	retq
Comment 2 Michael Clark 2017-05-11 18:47:11 PDT
Another issue related to the use of predicate logic versus a branch is the unconditional execution of ISD::FSUB which causes FE_INEXACT for float to u64 and double to u64 conversion. See test code and workaround below.

See SelectionDAGLegalize::ExpandNode()

  case ISD::FP_TO_UINT: {
    SDValue True, False;
    EVT VT =  Node->getOperand(0).getValueType();
    EVT NVT = Node->getValueType(0);
    APFloat apf(DAG.EVTToAPFloatSemantics(VT),
                APInt::getNullValue(VT.getSizeInBits()));
    APInt x = APInt::getSignMask(NVT.getSizeInBits());
    (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
    Tmp1 = DAG.getConstantFP(apf, dl, VT);
    Tmp2 = DAG.getSetCC(dl, getSetCCResultType(VT),
                        Node->getOperand(0),
                        Tmp1, ISD::SETLT);
    True = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, Node->getOperand(0));
    // TODO: Should any fast-math-flags be set for the FSUB?
    False = DAG.getNode(ISD::FP_TO_SINT, dl, NVT,
                        DAG.getNode(ISD::FSUB, dl, VT,
                                    Node->getOperand(0), Tmp1));
    False = DAG.getNode(ISD::XOR, dl, NVT, False,
                        DAG.getConstant(x, dl, NVT));
    Tmp1 = DAG.getSelect(dl, NVT, Tmp2, True, False);
    Results.push_back(Tmp1);
    break;
  }
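
A rough C model of what this expansion computes for float -> u64 may make the problem clearer (function and variable names here are illustrative, not from LLVM, and the out-of-range casts model the machine behavior rather than strictly-conforming C). Both arms are evaluated before the select, which is why the FSUB executes and can raise FE_INEXACT even for inputs that only need the True arm:

#include <stdint.h>

uint64_t fptoui_select_model(float x)
{
        const float two63 = 9223372036854775808.0f;      /* 2^63, i.e. Tmp1 */
        uint64_t t = (uint64_t)(int64_t)x;                /* True: FP_TO_SINT(x) */
        uint64_t f = (uint64_t)(int64_t)(x - two63)       /* FSUB evaluated unconditionally */
                     ^ 0x8000000000000000ULL;             /* XOR with the sign mask */
        return (x < two63) ? t : f;                       /* select on SETLT */
}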



Expected output:

$ clang  fcvt-bug.cc 
$ ./a.out 
1 exact
1 inexact
1 exact
1 inexact
1 exact
1 inexact
1 exact
1 inexact

Erroneous output:

$ clang -DDISABLE_WORKAROUND fcvt-bug.cc 
$ ./a.out 
1 exact
1 inexact
1 inexact
1 inexact
1 exact
1 inexact
1 inexact
1 inexact

$ cat fcvt-bug.cc
#include <cstdio>
#include <cmath>
#include <limits>
#include <fenv.h>

typedef float              f32;
typedef double             f64;
typedef signed int         s32;
typedef unsigned int       u32;
typedef signed long long   s64;
typedef unsigned long long u64;

#if defined __GNUC__ && defined __x86_64__ && !defined DISABLE_WORKAROUND

__attribute__ ((noinline)) s64 fcvt_s_lu(f32 f)
{
        s64 res;
        static const union { f32 f; u32 u; } val = { .u = 0x5f000000 };
        if (f > val.f) {
                __asm__(
                        "subss      %[sub], %[input]\n  "
                        "movabsq    $0x8000000000000000, %%rdx\n        "
                        "cvttss2siq %[input], %[result]\n       "
                        "xorq       %%rdx, %[result]"
                 : [result] "=r"(res)
                 : [sub] "m"(val.f), [input] "x"(f)
                 : "rdx"
                );
        } else {
                __asm__(
                        "cvttss2siq %[input], %[result]"
                 : [result] "=r"(res)
                 : [input] "x"(f)
                );
        }
        return res;
}

__attribute__ ((noinline)) s64 fcvt_d_lu(f64 f)
{
        s64 res;
        static const union f64_bits { f64 f; u64 u; } val = { .u = 0x43E0000000000000ULL };
        if (f > val.f) {
                __asm__(
                        "subsd      %[sub], %[input]\n  "
                        "movabsq    $0x8000000000000000, %%rdx\n        "
                        "cvttsd2siq %[input], %[result]\n       "
                        "xorq       %%rdx, %[result]"
                 : [result] "=r"(res)
                 : [sub] "m"(val.f), [input] "x"(f)
                 : "rdx"
                );
        } else {
                __asm__(
                        "cvttsd2siq %[input], %[result]"
                 : [result] "=r"(res)
                 : [input] "x"(f)
                );
        }
        return res;
}

#else

__attribute__ ((noinline)) s64 fcvt_s_lu(f32 f) { return s64(u64(f)); }
__attribute__ ((noinline)) s64 fcvt_d_lu(f64 f) { return s64(u64(f)); }

#endif

__attribute__ ((noinline)) s32 fcvt_s_wu(f32 f) { return s32(u32(f)); }
__attribute__ ((noinline)) s32 fcvt_d_wu(f64 f) { return s32(u32(f)); }

void test_fcvt_s_wu(f32 a)
{
        feclearexcept(FE_ALL_EXCEPT);
        printf("%d ", fcvt_s_wu(a));
        printf("%s\n", fetestexcept(FE_INEXACT) ? "inexact" : "exact");
}

void test_fcvt_s_lu(f32 a)
{       
        feclearexcept(FE_ALL_EXCEPT);
        printf("%lld ", fcvt_s_lu(a));
        printf("%s\n", fetestexcept(FE_INEXACT) ? "inexact" : "exact");
}

void test_fcvt_d_wu(f64 a)
{
        feclearexcept(FE_ALL_EXCEPT);
        printf("%d ", fcvt_d_wu(a));
        printf("%s\n", fetestexcept(FE_INEXACT) ? "inexact" : "exact");
}

void test_fcvt_d_lu(f64 a)
{       
        feclearexcept(FE_ALL_EXCEPT);
        printf("%lld ", fcvt_d_lu(a));
        printf("%s\n", fetestexcept(FE_INEXACT) ? "inexact" : "exact");
}

int main()
{
        fesetround(FE_TONEAREST);

        test_fcvt_s_wu(1.0f);
        test_fcvt_s_wu(1.1f);
        test_fcvt_s_lu(1.0f);
        test_fcvt_s_lu(1.1f);
        test_fcvt_d_wu(1.0f);
        test_fcvt_d_wu(1.1f);
        test_fcvt_d_lu(1.0f);
        test_fcvt_d_lu(1.1f);
}
Comment 3 Michael Clark 2017-05-11 19:48:11 PDT
Given that the primary issue is the speed of branch vs predicate logic, here is a microbenchmark of the two approaches:

$ time fcvt-branch

real	0m0.208s
user	0m0.201s
sys	0m0.002s

$ time fcvt-cmov

real	0m0.241s
user	0m0.235s
sys	0m0.002s

int main() {
	static const union { float f; unsigned u; } val = { .u = 0x5f000000 };
	for (unsigned long i = 0; i < 100000000; i++) {
		unsigned long res;
		float f = 1.0f;
#ifndef USE_CMOV
		__asm__ volatile (
			"ucomiss    %[sub], %[input]\n	"
			"ja 1f\n	"
			"cvttss2siq %[input], %[result]\n	"
			"jmp 2f\n	"
			"1:\n	"
			"subss      %[sub], %[input]\n	"
			"movabsq    $0x8000000000000000, %%rdx\n	"
			"cvttss2siq %[input], %[result]\n	"
			"xorq       %%rdx, %[result]\n	"
			"2: "
		 : [result] "=r"(res)
		 : [sub] "m"(val.f), [input] "x"(f)
		 : "rdx"
		);
#else
		__asm__ volatile (
        		"movss   %[sub], %%xmm1\n	"
        		"movaps  %[input], %%xmm2\n	"
        		"subss   %%xmm1, %%xmm2\n	"
        		"cvttss2si       %%xmm2, %[result]\n	"
        		"movabsq $0x8000000000000000, %%rdx\n	"
        		"xorq    %[result], %%rdx\n	"
        		"cvttss2si       %[input], %[result]\n	"
        		"ucomiss %%xmm1, %[input]\n	"
        		"cmovaeq %%rdx, %[result]\n	"
		 : [result] "=r"(res)
		 : [sub] "m"(val.f), [input] "x"(f)
		 : "rdx", "xmm1", "xmm2"
		);
#endif
	}
}
Comment 4 Sanjay Patel 2017-05-12 06:49:52 PDT
(In reply to Michael Clark from comment #3)
> Given that the primary issue is the speed of branch vs predicate logic, here
> is a microbenchmark of the two approaches:

As written, this is the absolute best case for the branching version, right? We should vary the input values to exercise the predictor, so we know when/if the cmov version is ever better.

If you can provide some info about your test machine, that would be good too. It's quite likely that the data and our conclusions are going to be different for say Intel Skylake vs. AMD Jaguar.
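
As an illustration of varying the inputs, something like the following minimal sketch could be used (the value mix and the 1-in-64 out-of-range ratio are arbitrary assumptions, not a proposed benchmark):

#include <stdint.h>
#include <stdlib.h>

__attribute__ ((noinline)) uint64_t cvt(float f) { return (uint64_t)f; }

int main(void)
{
        enum { N = 1 << 16 };
        static float in[N];
        for (int i = 0; i < N; i++)
                /* ~1.5% of inputs are >= 2^63 (but still < 2^64), so the
                   range-check branch is mostly, not perfectly, predictable. */
                in[i] = (rand() & 63) ? 1.5f : 15000000000000000000.0f;
        uint64_t sum = 0;
        for (int rep = 0; rep < 2000; rep++)
                for (int i = 0; i < N; i++)
                        sum += cvt(in[i]);
        return (int)(sum & 1);   /* keep the result live */
}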

For now, I'm assuming that x86 needs to do something like this:
https://reviews.llvm.org/rL292154

This is for 2 reasons:
1. Converting selects to branches early could have a negative impact on DAGCombiner/isel because that's limited to a basic block. We actually do convert some selects early in CodeGenPrepare::optimizeSelectInst(), but that's limited to the case where we have a known expensive op (division for example) and/or metadata (profile info) to guide that decision.

2. As shown in the fptoui example, the select may not exist in IR. In that case, there's not much choice. We have to do this transform later unless we're willing to expand fptoui into the equivalent IR that contains a select and/or a compare+branch in CGP.
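
To make point 2 concrete, expanding fptoui early would mean emitting something with the shape of the following C (a hedged sketch of the idea only, not actual CodeGenPrepare output), so that later passes see an explicit compare+branch instead of a select:

#include <stdint.h>

uint64_t fptoui_branchy(float x)
{
        const float two63 = 9223372036854775808.0f;   /* 2^63 */
        if (x < two63)
                return (uint64_t)(int64_t)x;           /* in range: one cvttss2si */
        /* out of signed range: subtract 2^63, convert, then flip the MSB */
        return (uint64_t)(int64_t)(x - two63) ^ (1ULL << 63);
}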
Comment 5 Peter Cordes 2017-05-12 08:58:07 PDT
There are certainly cases where cmov's data dependency is a bottleneck, instead of a control dependency that can be speculated and that avoids doing some work.  e.g. http://stackoverflow.com/questions/28875325/gcc-optimization-flag-o3-makes-code-slower-then-o2 shows a case where a cmov data dependency (from gcc -O3) is slower than a conditional branch (from gcc -O2), for sorted input that leads to perfect branch prediction.

But caution is required here.  More branchy code could be slower, even with predictable branches.

I'm not sure what heuristics would be most useful in guiding the choice between branchy vs. branchless.  Obviously saving a lot of work in one side of the branch is a big one, but also creating loop-carried dep chains that would be the biggest bottleneck is another.  (In that SO question, unrolling with multiple accumulators would have avoided a latency bottleneck.)
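
For a concrete picture of that trade-off, a loop of roughly this shape (a guess at the kind of code behind that SO question, not the actual source) shows the difference: with CMOV, each iteration's sum depends on the previous iteration's CMOV result, so the compare/add/cmov chain is loop-carried; with a well-predicted branch, the adds can be speculated ahead of the compares.

long conditional_sum(const int *data, long n)
{
        long sum = 0;
        for (long i = 0; i < n; i++)
                if (data[i] >= 128)    /* cmov form serializes sum; branchy form doesn't */
                        sum += data[i];
        return sum;
}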

----

More total branches would take up more branch-predictor entries, which would likely mean that on average branches are predicted less well.  This is another effect that microbenchmarks won't reveal.

On some CPUs, never-taken branches don't use any branch-prediction resources (since fall-through is the default prediction), but on at least some Intel CPUs, they probably do affect and are affected by other branches.  http://agner.org/optimize/ says that recent Intel CPUs don't use static branch prediction, but instead 


Also, taken branches always have the potential to reduce front-end throughput, even when predicted perfectly.  CPUs have buffers between fetch and decode to hide fetch bubbles created by branches, but too many could expose them.  uop-caches do help with this, but they aren't trace caches (so a taken branch usually means that the next uop comes from a different line of the uop cache).

----

Those inline-asm sequences have a flaw which hurts the branching version: The fast-path has a taken JMP, because it has to jump over the slow-path instead of putting it out-of-line at the end of the function.  (This is only an inline-asm problem; gcc and clang of course already put unlikely branch targets out-of-line and have them jump back, so this is only a problem with the test methodology).

Taken-branch throughput can be an issue inside a tight loop like the one you're testing.  (e.g. Haswell/Skylake can execute/retire 2 not-taken branches per clock, but only one taken branch per 2 clocks.  Loop-branches are a special case, so tiny loops can run at 1 iteration per clock.)
Comment 6 Peter Cordes 2017-05-12 09:37:15 PDT
Oops, forgot to finish writing the paragraph about static branch prediction, and whether modern Intel CPUs use it or not.  Matt Godbolt did some investigation of Nehalem vs. Haswell: https://xania.org/201602/bpu-part-two.

Also forgot to mention: on Intel Pentium-M/Core2 through Haswell, cmov is 2 uops with 2c latency.  On Broadwell/SKL, it's 1 uop (like ADC/SBB and some other 3-input instructions, taking advantage of the 3-input uop support introduced for FMA in Haswell).


CMOV is only 1 uop / m-op on AMD, and on Silvermont (but it is 2c latency on Silvermont).

Anyway, on much of the current installed-base of x86 CPUs, CMOV is 2 uops / 2c latency, but that will change as more older Intel CPUs are replaced.

Of course, CMOV isn't the only option for branchless code.  Especially on CPUs where it's 2 uops, an alternative like SAR/AND (or BMI2 ANDN) can be good vs. CMP/CMOV.  See this interesting case of compiler hand-holding for example: http://stackoverflow.com/questions/34407437/what-is-the-efficient-way-to-count-set-bits-at-a-position-or-lower/34410357#34410357, where we couldn't avoid a CMP or TEST because POPCNT clobbers flags, so SAR/AND is no worse than TEST/CMOV even on CPUs with efficient CMOV.
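
As a sketch of that shift/mask style in C (an illustration of the general idea, not the code from that answer; it assumes the usual arithmetic right shift of signed values, which gcc/clang provide on x86):

#include <stdint.h>

/* Select v when x is non-negative, else 0, without CMP/CMOV:
   roughly SAR + AND(N) after inlining. */
static inline int64_t keep_if_nonnegative(int64_t x, int64_t v)
{
        int64_t mask = ~(x >> 63);   /* all-ones when x >= 0, zero when x < 0 */
        return v & mask;
}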
Comment 7 Sanjay Patel 2017-05-12 13:00:42 PDT
Thanks, Peter. 

One more variable to consider here: what is the size difference between the cmov and branch versions? I haven't counted the bytes, but that may tip the scale one way or the other when optimizing for size. And even nastier, are we supposed to be aligning the taken target block with nops? I think we do that for loop start blocks, but I haven't seen it with if/then.
Comment 8 Peter Cordes 2017-05-12 13:44:49 PDT
> what is the size difference between the cmov and branch versions?

for gcc's versions:

branchless: (sub-optimal, should avoid the MOV with an LEA)
  4008d0:       48 63 10                movsxd rdx,DWORD PTR [rax]
  4008d3:       48 89 d1                mov    rcx,rdx
  4008d6:       48 01 da                add    rdx,rbx
  4008d9:       83 c1 80                add    ecx,0xffffff80
  4008dc:       48 0f 4d da             cmovge rbx,rdx
  4008e0:       48 83 c0 04             add    rax,0x4
  4008e4:       4c 39 e0                cmp    rax,r12
  4008e7:       75 e7                   jne    4008d0 <main+0xc0>
0x18 bytes of code, could be 0x16 I think

branchy:
  400a20:       4d 63 1a                movsxd r11,DWORD PTR [r10]
  400a23:       41 83 fb 7f             cmp    r11d,0x7f
  400a27:       7e 7d                   jle    400aa6 <main+0x116>
  400a29:       49 83 c2 04             add    r10,0x4
  400a2d:       4c 01 db                add    rbx,r11
  400a30:       4d 39 ca                cmp    r10,r9
  400a33:       75 eb                   jne    400a20 <main+0x90>
0x14 bytes of code.  (Could be 0x13 if gcc picked a reg that didn't need a REX prefix for the 32-bit cmp).

Sorry for using gcc output in this clang bug, but I already had the compiled binaries there to disassemble.  I think it's going to be pretty common for the branchless version to be a little bit larger, especially when it requires constants in registers (because CMOV doesn't take immediate operands).


> are we supposed to be aligning the taken target block with nops?

Definitely not, since the NOP would run inside the loop.  Branch-target alignment is not very important in CPUs with uop-caches, and completely irrelevant when running from the loop buffer.  (The loop buffer *does* act like a trace cache for tiny loops, I think, so the uop following a taken branch can issue as part of the same group as the branch.)

Putting even a single long-NOP inside the loop to align a non-loop branch target would hurt the front-end more than it helped, because that NOP would take up issue/retire bandwidth.

The alignment of branches (the jcc, not the target) can affect performance because it determines which other branches it aliases, so it can be hard to measure just the effect of branch alignment.  Changing the alignment of anything tends to change the alignment of everything else in the program.


It could possibly help some to stretch out the real instructions in the loop by using redundant prefixes or longer addressing modes / immediates than necessary, if that avoids having the branch target be very near the end of a 16B-aligned block, but I'm not sure about that.  (And unfortunately there's no mechanism in gas or NASM syntax, or any other asm syntax I'm aware of for requesting that.)

Decreasing code-density is a risky bet, especially when you don't have profiling data to show which loops are truly hot and actually bottlenecked on front-end throughput, not an execution unit, latency, or memory.
Comment 9 Michael Clark 2017-05-12 14:18:11 PDT
Regarding the microbenchmark using static values (i.e., always-predicted branches) not being representative of a real workload: one would then ask what would be representative of an actual workload.

1. Values vary but are less than LLONG_MAX, 100% predicted branches
2. Values vary but a few outliers are out of range, >99% predicted branches
3. Pathological case, i.e. floating-point noise values

Use case 3 seems unusual, as a developer would typically convert values they expect to be in range, rather than perform what would essentially be a binary conversion to 0 or ULLONG_MAX for random values with random exponents that mostly fall outside the range of u64. Actual code would more likely compute a max value and scale the floating-point values before a conversion (I have written such code many times).

In essence, I am saying that in the majority of code the branch prediction rate is likely to be >99%, perhaps closer to 100% in most cases.
Comment 10 Michael Clark 2017-05-12 14:30:46 PDT
Adding to that, there is the question of whether the choice to use size_t / unsigned is made because the value is intended to be positive, versus to gain the additional range between LLONG_MAX and ULLONG_MAX.

Values from 2^53 to 2^64, of course, will all be inexact conversions, and the signed case of +/-2^53 to +/-2^63 is already handled in hardware, with the MXCSR INEXACT bit being set by cvttss2si/cvttsd2si.
Comment 11 Peter Cordes 2017-05-12 19:42:02 PDT
(In reply to Michael Clark from comment #9)
> always predicted branches not being representative of a real workload

Agreed that a branchy float->u64 conversion will predict extremely well in most code.  But it's not necessarily representative of other cases that are only say 98% predictable, which is the big-picture for this bug.


Getting slightly off-topic here; I know you brought this up as a silly example of what *not* to optimize for.

> 3. Pathological case. i.e. floating point noise values

> essentially be a binary conversion of 0 or ULLONG_MAX for random values with
> random exponents that mostly fall outside of the range of u64.

Fun fact about IEEE754: Half of all representable values have magnitude less than 1.0.  Most have a magnitude below LLONG_MAX, so cvtss2si %xmm, %r64 will produce something other than the "integer indefinite" value (1ULL<<63) that Intel documents for out-of-range conversions: http://felixcloutier.com/x86/CVTSS2SI.html.

The range of representable values is huge, but they get sparser the farther away from 0 you get.

For negative floats, your conversion code appears to implement the semantics of (uint64_t)(int64_t)f.

That's what C programmers should write if they want an unsigned result but don't need the extra input range, and want efficiency on x86 without AVX512 (and probably some other architectures which only provide float<->signed conversions).  Unfortunately C makes it hard to tell the compiler what you do/don't care about without making the code really ugly.
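
For illustration, the pattern being described is simply the double cast (hypothetical helper name):

#include <stdint.h>

/* Assumes the input is known to be below 2^63 (negative inputs wrap);
   compiles to a single cvttss2si with no range check. */
static inline uint64_t f32_to_u64_narrow(float f)
{
        return (uint64_t)(int64_t)f;
}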


As far as the predictability of the branch:  only bit patterns from 0x5f000000 to 0x7f7fffff and +Inf (0x7f800000) compare >= (float)(1ULL<<63).  (Exponent=255 and non-zero mantissa is a NaN, which is unordered.)  Check it out on https://www.h-schmidt.net/FloatConverter/IEEE754.html.
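
That range is easy to check directly (a small sketch; the bit patterns are the ones mentioned above):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float from_bits(uint32_t u) { float f; memcpy(&f, &u, sizeof f); return f; }

int main(void)
{
        const float two63 = 9223372036854775808.0f;         /* (float)(1ULL << 63) */
        printf("%d\n", from_bits(0x5f000000u) >= two63);     /* 1: exactly 2^63 */
        printf("%d\n", from_bits(0x5effffffu) >= two63);     /* 0: largest float below 2^63 */
        printf("%d\n", from_bits(0x7f7fffffu) >= two63);     /* 1: FLT_MAX */
        printf("%d\n", from_bits(0x7f800000u) >= two63);     /* 1: +Inf */
        printf("%d\n", from_bits(0x7fc00000u) >= two63);     /* 0: NaN is unordered */
        return 0;
}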

----

BTW, you can use LEA instead of XORQ so you can hoist the movabsq of the constant out of the loop without needing an extra MOV.  XORing to flip the MSB is the same as adding to flip the MSB: with nowhere for the carry to go, add and add-without-carry are the same.  (And so is subtract).

Unfortunately I don't see a way to use CMPLTPS/ANDPS to do the branchless stuff with only one cvttss2si (and avoid integer CMOV), unless we do the conditional sign-bit flip with vectors, too (e.g. integer left-shift of a compare result and XORPS, or maybe PSIGND).  (And that would still change the result in the positive out-of-range because we'd get the "integer indefinite" value and *not* flip the MSB to make it 0 instead of 1ULL<<63.)
Comment 12 Peter Cordes 2017-05-12 20:10:50 PDT
(In reply to Michael Clark from comment #10)
> Values from to 2^53 to 2^64, of course, will all be inexact conversions, and
> for the signed case of +/-2^53 to +/-2^63 are already handled in hardware
> with the MXCSR INEXACT bit being set by cvttss2si/cvttsd2si

Are you sure about that?

Not all integers are exactly representable as float/double (beyond 2^24 or 2^53), but all floats/doubles in that range represent integers.  e.g. all the even integers from 2^24 to 2^25 are exactly representable.  (2^24+1 is the first number that can't be exactly represented by IEEE float32 http://stackoverflow.com/questions/3793838/which-is-the-first-integer-that-an-ieee-754-float-is-incapable-of-representing-e).

Put another way, when the bias-corrected exponent is greater than the number of mantissa bits, multiplying the mantissa by 2^exponent shifts the decimal point (err, base-2 radix point?) to the right all the way out of the mantissa, leaving some zeros in the LSB of the base2 integer representation.
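
A quick check of those claims (a trivial sketch):

#include <stdio.h>

int main(void)
{
        printf("%.1f\n", (double)(float)16777216);   /* 2^24     -> 16777216.0 (exact) */
        printf("%.1f\n", (double)(float)16777217);   /* 2^24 + 1 -> 16777216.0 (rounds) */
        printf("%.1f\n", (double)(float)16777218);   /* 2^24 + 2 -> 16777218.0 (exact) */
        return 0;
}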

I haven't tested, but why would hardware raise the inexact exception in that case?
Comment 13 Michael Clark 2017-05-13 00:19:58 PDT
Sure, there are some representable values, but there is a loss of precision.

I would have to read the IEEE-754 spec to be sure of the exact rules regarding exact vs. inexact conversions, but there is a loss of precision above 2^53 such that a precise countable integer cannot be distinguished, because the rightmost bits of the significand cannot be represented. By definition precision has to be lost: while a magnitude of 2^64 can be represented, there are only 52 bits and 23 bits of precision for double and single respectively.

At magnitudes around 2^63 there are 10 bits of double precision lost and 39 bits of single precision lost, so the conversion will alias, with representable values spaced 2^10 apart for double to u64 and 2^39 apart for single precision to u64. You can do an exact conversion to float or double for specific values, but not the other way.
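
That spacing can be shown directly (a minimal sketch; nextafter gives the neighbouring representable value just below 2^63):

#include <math.h>
#include <stdio.h>

int main(void)
{
        float  f = 9223372036854775808.0f;   /* 2^63 */
        double d = 9223372036854775808.0;    /* 2^63 */
        printf("%.1f\n", (double)(f - nextafterf(f, 0.0f)));  /* 549755813888.0 = 2^39 */
        printf("%.1f\n", d - nextafter(d, 0.0));              /* 1024.0 = 2^10 */
        return 0;
}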

This is, I believe, why the subtraction of 2^63 (which is done to allow the signed hardware conversion to emulate an unsigned conversion) causes the inexact flag to be set.

There can only be an exact conversion for positive 0 to < 2^53 for double precision values and 0 to < 2^24 for single precision values.

In my particular use case I am emulating a RISC-V processor that has unsigned conversions, and I am running the RISC-V compliance tests on the results of the conversion intrinsics generated by the compiler.

I understand AVX-512 implements native unsigned conversions, in which case it's handled by the hardware.
Comment 14 Peter Cordes 2017-05-15 14:17:59 PDT
Sorry for continuing the off-topic discussion in this thread, will keep this short.

(In reply to Michael Clark from comment #13)
> I would have to read the IEEE-754 spec to be sure on the exact rules
> regarding exact vs inexact conversions

If the rounding step changes the value, it's inexact.  If not, it's exact.

For a fp->int conversion, it's a round to integer of the fp value.  For a subtraction, it's a round to float or double of the result.

In the float->u64 case, it should be possible for both those operations to be exact, e.g. 1.5*2^63 - 2^63 = 2^62 exactly.  (1.5 is a possible mantissa value).
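
A quick check of that example (a minimal sketch; volatile is only there to discourage constant folding):

#include <fenv.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        volatile float x = 13835058055282163712.0f;         /* 1.5 * 2^63, exact as a float */
        feclearexcept(FE_ALL_EXCEPT);
        volatile float diff = x - 9223372036854775808.0f;   /* = 2^62, exact */
        volatile int64_t i = (int64_t)diff;                  /* = 2^62, exact */
        printf("%lld %s\n", (long long)i,
               fetestexcept(FE_INEXACT) ? "inexact" : "exact");
        return 0;
}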

I tested what x86 hardware conversion (on Skylake) does for an int->float->int round trip, using some printing around this:
	float f = a;
	long long roundtrip = f;
clang uses the expected cvtsi2ss and cvttss2si.


With input = 64 + (1LL << 30), rounding to nearest float loses the +64
inexact 0x40000040(1073741888) -> 1073741824.000000 -> 0x40000000(1073741824)

With input of 128 + (1LL << 30), that doesn't happen.
exact   0x40000080(1073741952) -> 1073741952.000000 -> 0x40000080(1073741952)
Comment 15 Michael Clark 2017-05-15 14:59:23 PDT
Peter, when you say exact, do you mean the inexact flag is not set in MXCSR after the conversion, or that the imprecise floating point aliases to an exact integer representation (i.e. you can perform a round trip and get the same value)? Subtle difference.

I'll run your test and check the MXCSR flags...
Comment 16 Michael Clark 2017-05-15 15:52:11 PDT
The imprecise float is "aliasing" with an integer representation. float can't represent more than 24 bits of precision (counting the leading 1 bit). It's arguable whether the implementation is doing the right thing for magnitudes above the precision of the floating point type. The hardware is reporting an exact conversion to an integer when it can't know if the conversion is exact (using signed integers in this test, as the subtract in the unsigned conversion can cause inexact). I guess "unknown" is not INEXACT.

$ cat roundtrip.c
#include <stdio.h>
#include <math.h>
#include <fenv.h>

typedef float f32;
typedef long long s64;

__attribute__ ((noinline)) s64 fcvt_lu_s(f32 f) { return (s64)f; }
__attribute__ ((noinline)) f32 fcvt_s_lu(s64 u) { return (f32)u; }

void test_fcvt(s64 u)
{
	#pragma STDC FENV_ACCESS on

	f32 f;
	s64 v;
	int nx;

	feclearexcept(FE_ALL_EXCEPT);

	nx = fetestexcept(FE_INEXACT);
	printf("%lld %s\n", u, nx ? "inexact" : "exact");

	feclearexcept(FE_ALL_EXCEPT);
	f = fcvt_s_lu(u);
	nx = fetestexcept(FE_INEXACT);
	printf("%f %s\n", f, nx ? "inexact" : "exact");

	feclearexcept(FE_ALL_EXCEPT);
	v = fcvt_lu_s(f);
	nx = fetestexcept(FE_INEXACT);
	printf("%lld %s\n", v, nx ? "inexact" : "exact");
}

int main()
{
	fesetround(FE_TONEAREST);

	test_fcvt((1 << 20) + 64);
	test_fcvt((1 << 20) + 128);
	test_fcvt((1 << 30) + 64);
	test_fcvt((1 << 30) + 128);
}

$ gcc roundtrip.c -o roundtrip
$ ./roundtrip
1048640 exact
1048640.000000 exact
1048640 exact
1048704 exact
1048704.000000 exact
1048704 exact
1073741888 exact
1073741824.000000 inexact
1073741824 exact
1073741952 exact
1073741952.000000 exact
1073741952 exact
Comment 17 Sanjay Patel 2017-07-01 06:51:14 PDT
A cmov-to-branch pass for x86 has been proposed here:
https://reviews.llvm.org/D34769
Comment 18 Simon Pilgrim 2017-09-05 06:00:49 PDT
(In reply to Sanjay Patel from comment #17)
> A cmov-to-branch pass for x86 has been proposed here:
> https://reviews.llvm.org/D34769

This was committed at rL308142