I recently imported a clang 5.0.0 snapshot (from just before the release_50 branch) into FreeBSD 12.0-CURRENT, and soon after we noticed that one of our libm regression tests was failing, in particular the sinl(3) test. It turns out that this failure was introduced by r307529 ("This patch completely replaces the scheduling information for the SandyBridge architecture target by modifying the file X86SchedSandyBridge.td located under the X86 Target"), see also https://reviews.llvm.org/D35019. I did a bit of investigation, and it turned out that r307529 changes the x87 instruction order, but I do not yet know if this is due to some sort of undefined behavior in the sinl code, or a bug in LLVM. The code in question can be found here: https://github.com/freebsd/freebsd/blob/master/lib/msun/ld80/e_rem_pio2l.h The part that seems to be miscompiled is the for loop from line 137 onwards, looking like: 43 static const double 44 zero = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */ 45 two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */ ... 77 long double z,w,t,r,fn; 78 double tx[3],ty[2]; 79 int e0,ex,i,j,nx,n; ... 136 z = u1.e; 137 for(i=0;i<2;i++) { 138 tx[i] = (double)((int32_t)(z)); 139 z = (z-tx[i])*two24; 140 } 141 tx[2] = z; Before r307529, this loop is unrolled to: #DEBUG_VALUE: sinl:z <- [DW_OP_LLVM_fragment 0 80] %FP1 #DEBUG_VALUE: sinl:x <- %FP0 .loc 1 0 6 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:0:6 fstp %st(0) .loc 1 84 7 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:84:7 shll $16, %ebx .loc 1 135 19 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:135:19 movq %rax, -96(%rbp) movw $16406, -88(%rbp) # imm = 0x4016 fldt -96(%rbp) .Ltmp44: #DEBUG_VALUE: __ieee754_rem_pio2l:z <- %FP0 #DEBUG_VALUE: __ieee754_rem_pio2l:u1 <- [DW_OP_LLVM_fragment 0 80] %FP0 .loc 1 138 20 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:20 fnstcw -68(%rbp) movzwl -68(%rbp), %eax movw $3199, -68(%rbp) # imm = 0xC7F fldcw -68(%rbp) movw %ax, -68(%rbp) fistl -72(%rbp) fldcw -68(%rbp) .loc 1 138 11 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:11 cvtsi2sdl -72(%rbp), %xmm0 .loc 1 138 9 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:9 movsd %xmm0, -48(%rbp) .loc 1 139 10 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:10 movsd %xmm0, -152(%rbp) .loc 1 139 9 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:9 fsubl -152(%rbp) .loc 1 139 16 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:16 flds .LCPI0_1(%rip) fmul %st(0), %st(1) #DEBUG_VALUE: __ieee754_rem_pio2l:z <- %FP0 .loc 1 138 20 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:20 fnstcw -66(%rbp) movzwl -66(%rbp), %eax movw $3199, -66(%rbp) # imm = 0xC7F fldcw -66(%rbp) movw %ax, -66(%rbp) fxch %st(1) fistl -76(%rbp) fldcw -66(%rbp) .loc 1 138 11 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:11 xorps %xmm0, %xmm0 cvtsi2sdl -76(%rbp), %xmm0 .loc 1 138 9 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:9 movsd %xmm0, -40(%rbp) .loc 1 139 10 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:10 movsd %xmm0, -144(%rbp) .loc 1 139 9 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:9 fsubl -144(%rbp) .loc 1 139 16 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:16 fmulp %st(1) .Ltmp45: .loc 1 141 8 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:141:8 fstpl -32(%rbp) After r307529, it becomes: #DEBUG_VALUE: sinl:z <- [DW_OP_LLVM_fragment 0 80] %FP1 #DEBUG_VALUE: sinl:x <- %FP0 .loc 1 0 6 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:0:6 fstp %st(0) .loc 1 135 19 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:135:19 movq %rax, -96(%rbp) movw $16406, -88(%rbp) # imm = 0x4016 fldt -96(%rbp) .Ltmp44: #DEBUG_VALUE: __ieee754_rem_pio2l:z <- %FP0 #DEBUG_VALUE: __ieee754_rem_pio2l:u1 <- [DW_OP_LLVM_fragment 0 80] %FP0 .loc 1 138 20 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:20 fnstcw -68(%rbp) movzwl -68(%rbp), %eax movw $3199, -68(%rbp) # imm = 0xC7F fldcw -68(%rbp) movw %ax, -68(%rbp) fistl -72(%rbp) fldcw -68(%rbp) .loc 1 138 11 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:11 cvtsi2sdl -72(%rbp), %xmm0 .loc 1 138 9 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:9 movsd %xmm0, -48(%rbp) .loc 1 139 10 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:10 movsd %xmm0, -152(%rbp) .loc 1 139 9 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:9 fsubl -152(%rbp) .loc 1 138 20 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:20 fnstcw -66(%rbp) .loc 1 139 16 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:16 flds .LCPI0_1(%rip) .loc 1 138 20 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:20 movzwl -66(%rbp), %eax movw $3199, -66(%rbp) # imm = 0xC7F fldcw -66(%rbp) .loc 1 139 16 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:16 fmul %st(0), %st(1) #DEBUG_VALUE: __ieee754_rem_pio2l:z <- %FP0 .loc 1 138 20 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:20 movw %ax, -66(%rbp) fxch %st(1) fistl -76(%rbp) fldcw -66(%rbp) .loc 1 138 11 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:11 xorps %xmm0, %xmm0 cvtsi2sdl -76(%rbp), %xmm0 .Ltmp45: .loc 1 84 7 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:84:7 shll $16, %ebx .Ltmp46: .loc 1 138 9 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:9 movsd %xmm0, -40(%rbp) .loc 1 139 10 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:10 movsd %xmm0, -144(%rbp) .loc 1 139 9 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:9 fsubl -144(%rbp) .loc 1 139 16 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:16 fmulp %st(1) .Ltmp47: .loc 1 141 8 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:141:8 fstpl -32(%rbp) Viewed as a context diff this looks as follows: @@ -179,7 +179,6 @@ jbe .LBB0_9 .LBB0_14: # %if.end80.i fstp %st(0) - shll $16, %ebx movq %rax, -96(%rbp) movw $16406, -88(%rbp) # imm = 0x4016 fldt -96(%rbp) @@ -194,18 +193,19 @@ movsd %xmm0, -48(%rbp) movsd %xmm0, -152(%rbp) fsubl -152(%rbp) - flds .LCPI0_1(%rip) - fmul %st(0), %st(1) fnstcw -66(%rbp) + flds .LCPI0_1(%rip) movzwl -66(%rbp), %eax movw $3199, -66(%rbp) # imm = 0xC7F fldcw -66(%rbp) + fmul %st(0), %st(1) movw %ax, -66(%rbp) fxch %st(1) fistl -76(%rbp) fldcw -66(%rbp) xorps %xmm0, %xmm0 cvtsi2sdl -76(%rbp), %xmm0 + shll $16, %ebx movsd %xmm0, -40(%rbp) movsd %xmm0, -144(%rbp) fsubl -144(%rbp) E.g., apart from the moving around of 'shll $16, %ebx', which does not influence the end result, the loading of the LCPI0_1 constant and multiplying it is moved further into the code, and that definitely does influence the result. I would like to get to the bottom of this before 5.0.0 is released, if possible, otherwise I would propose to revert r307529, at least in the release_50 branch. But it should still be fixed somehow. Currently I have a small test case which is derived from the libm code in e_rem_pio2l.h, but it does not seem to exhibit the same problem, for some reason that I do not know yet: #include <stdio.h> __attribute__((noinline)) void g(double *tx) { printf("tx[0]=%.20a tx[1]=%.20a tx[2]=%.20a\n", tx[0], tx[1], tx[2]); } static const double two24 = 1.67772160000000000000e+07; __attribute__((noinline)) void f(long double z) { int i; double tx[3]; for (i = 0; i < 2; i++) { tx[i] = (double)((int)(z)); z = (z - tx[i]) * two24; } tx[2] = z; g(tx); } int main(void) { f(0x1.b2f3ee96e76003260000p+23); return 0; } The correct values this should print are: tx[0]=0x1.b2f3ee00000000000000p+23, tx[1]=0x1.2dcec000000000000000p+22, tx[2]=0x1.93000000000000000000p+16
Out of interest, did you see this by setting your target-cpu to sandybridge or x86-64? The generic x86-64 cpu current uses the sandy bridge model by default which will expose this much wider.....
(In reply to Simon Pilgrim from comment #1) > Out of interest, did you see this by setting your target-cpu to sandybridge > or x86-64? The generic x86-64 cpu current uses the sandy bridge model by > default which will expose this much wider..... The default is to compile for generic x86-64. (Athough the end-user can override this using a CPUTYPE setting, this is not often done in practice.)
Interestingly, compiling with an explicit -march=sandybridge appears to make the code work correctly! So it seems there is still some difference between the 'generic' x86-64 CPU and sandybridge?
Here's a working small test case: #include <stdio.h> __attribute__((noinline)) int g(double *tx) { int res = (tx[2] != 0x1.93p+16); printf("tx[0]=%.20a tx[1]=%.20a tx[2]=%.20a: %s\n", tx[0], tx[1], tx[2], res ? "Bad!" : "OK"); return res; } static const double two24 = 1.67772160000000000000e+07; __attribute__((noinline)) int f(long double z) { int i; double tx[3]; for (i = 0; i < 2; i++) { tx[i] = (double)((int)(z)); z = (z - tx[i]) * two24; //printf("z=%.20La\n", z); } tx[2] = z; return g(tx); } int main(void) { return f(0x1.b2f3ee96e76003260000p+23L); } The correct result (when compiling with trunk before r307529) should be: tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK The bad result (when compiling with trunk on or after r307529) is: tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x0.00000000000000000000p+0: Bad! Again, the difference between produced assembly looks similar: --- pio2m-r307528.s +++ pio2m-r307529.s @@ -68,11 +68,11 @@ .Lcfi6: .cfi_def_cfa_register %rbp subq $64, %rsp - fldt 16(%rbp) fnstcw -4(%rbp) movzwl -4(%rbp), %eax movw $3199, -4(%rbp) # imm = 0xC7F fldcw -4(%rbp) + fldt 16(%rbp) movw %ax, -4(%rbp) fistl -8(%rbp) fldcw -4(%rbp) @@ -80,12 +80,12 @@ movsd %xmm0, -64(%rbp) movsd %xmm0, -32(%rbp) fsubl -32(%rbp) - flds .LCPI1_0(%rip) - fmul %st(0), %st(1) fnstcw -2(%rbp) + flds .LCPI1_0(%rip) movzwl -2(%rbp), %eax movw $3199, -2(%rbp) # imm = 0xC7F fldcw -2(%rbp) + fmul %st(0), %st(1) movw %ax, -2(%rbp) fxch %st(1) fistl -12(%rbp) E.g. the flds and fmul are moved apart somehow, and that influences the end result. Note that uncommenting the printf() call at the end of the for loop 'fixes' the problem, and then it shows the correct end result. But that is pretty much papering over the problem. :(
Generic x86-64 has the following features: def : ProcessorModel<"x86-64", SandyBridgeModel, [ FeatureX87, FeatureMMX, FeatureSSE2, FeatureFXSR, Feature64Bit, FeatureSlowBTMem ]>; It's just using the scheduling model for the instructions that it supports. But sandybridge/corei7-avx supports: def SNBFeatures : ProcessorFeatures<[], [ FeatureX87, FeatureMMX, FeatureAVX, FeatureFXSR, FeatureCMPXCHG16B, FeaturePOPCNT, FeatureAES, FeatureSlowDivide64, FeaturePCLMUL, FeatureXSAVE, FeatureXSAVEOPT, FeatureLAHFSAHF, FeatureSlow3OpsLEA, FeatureFastScalarFSQRT, FeatureFastSHLDRotate ]>; class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel, SNBFeatures.Value, [ FeatureSlowBTMem, FeatureSlowUAMem32 ]>; Any of those feature diffs can cause changes in isel which could affect final codegen + order, and there are plenty of other CPUs in between that use the model as well. Please can you try with yonah, core2, penryn and corei7 and westmere to see if any of those expose it as well? My current guess is a SSE vs AVX discrepancy.
Collision! Thanks for the test case, I'll compare the codegen
Btw, at least -O2 is needed to reproduce, I forgot to mention that, sorry.
(In reply to Simon Pilgrim from comment #5) > Please can you try with yonah, core2, penryn and corei7 and westmere to see > if any of those expose it as well? My current guess is a SSE vs AVX > discrepancy. clang r307529 doesn't accept -march=yonah, but the others all seem to result in working code: core2: tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK penryn: tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK nehalem: tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK westmere: tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK sandybridge: tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK ivybridge: tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK haswell: tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK broadwell: tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK skylake: tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK Apparently, only when you use plain x86-64, it starts failing...
The bug disappears when SSE3 is enabled which allows use of FISTTP to avoid control word manipulation for truncated stores.
I have a fix (add missing DEFS/USES for the FPSW reg) - I'll create a simpler test case and put it up for review tomorrow.
(In reply to Simon Pilgrim from comment #10) > I have a fix (add missing DEFS/USES for the FPSW reg) - I'll create a > simpler test case and put it up for review tomorrow. That fix fails on EXPENSIVE_CHECKS builds, so I don't have a solution right now. We can make a minor tweak to the sandy bridge model to hide the problem - disabling the FLDCW16m/FNSTCW16m entries is enough. I've been looking at X86TargetLowering::EmitInstrWithCustomInserter in case the FNSTCW insertion is wrong, but I haven't noticed anything. I've added the test case at rL310198 Note: X86InstrFPStack.td is missing Defs = [FPSW] for various x87 instructions, as they leave the status flags reg in an undefined state - that needs fixing but creating tests will be tricky. Did I mention I hate x87 programming?
(In reply to Simon Pilgrim from comment #11) > (In reply to Simon Pilgrim from comment #10) > > I have a fix (add missing DEFS/USES for the FPSW reg) - I'll create a > > simpler test case and put it up for review tomorrow. > > That fix fails on EXPENSIVE_CHECKS builds, so I don't have a solution right > now. > > We can make a minor tweak to the sandy bridge model to hide the problem - > disabling the FLDCW16m/FNSTCW16m entries is enough. What will be the side effects of such a change? > I've been looking at X86TargetLowering::EmitInstrWithCustomInserter in case > the FNSTCW insertion is wrong, but I haven't noticed anything. > > I've added the test case at rL310198 > > Note: X86InstrFPStack.td is missing Defs = [FPSW] for various x87 > instructions, as they leave the status flags reg in an undefined state - > that needs fixing but creating tests will be tricky. > > Did I mention I hate x87 programming? Heh, it's a can of worms, indeed.
(In reply to Dimitry Andric from comment #12) > (In reply to Simon Pilgrim from comment #11) > > (In reply to Simon Pilgrim from comment #10) > > > I have a fix (add missing DEFS/USES for the FPSW reg) - I'll create a > > > simpler test case and put it up for review tomorrow. > > > > That fix fails on EXPENSIVE_CHECKS builds, so I don't have a solution right > > now. > > > > We can make a minor tweak to the sandy bridge model to hide the problem - > > disabling the FLDCW16m/FNSTCW16m entries is enough. > > What will be the side effects of such a change? Side effect is that the problem (instructions being moved across a control word instruction) is still there and likely to go off at some other point now that teams are starting to improve their scheduler models, affecting x87 instruction ordering.
Ok, so to fix this, https://reviews.llvm.org/rL310792 should be merged into 5.0.0. It is pretty big, but it's a continuation of r307529 which is already in 5.0.0, so I'd really like to have it. Gadi, do you think this is OK? Or do we need to let r310792 bake for a while, still?
I guess it should be ok. Note that I ran various tests before submitting the revision. I will re-run them with your change. Note that we are still missing the scheduling of some SNB instructions - which I hope to finish gathering after committing the main HSW and SKL scheduling.
I'll let r310792 sit in tree a bit more before merging it.
(In reply to Hans Wennborg from comment #16) > I'll let r310792 sit in tree a bit more before merging it. I'm waiting for follow-up comments on https://reviews.llvm.org/D36388 to be resolved.
Perhaps we should try to revert r307529 on the branch instead?
(In reply to Hans Wennborg from comment #18) > Perhaps we should try to revert r307529 on the branch instead? Yeah, maybe that is a safer solution. Before r307529, the scheduler seems to have worked okay enough... :)
(In reply to Dimitry Andric from comment #19) > (In reply to Hans Wennborg from comment #18) > > Perhaps we should try to revert r307529 on the branch instead? > > Yeah, maybe that is a safer solution. Before r307529, the scheduler seems > to have worked okay enough... :) I've reverted it in r311600.
(In reply to Hans Wennborg from comment #20) > (In reply to Dimitry Andric from comment #19) > > (In reply to Hans Wennborg from comment #18) > > > Perhaps we should try to revert r307529 on the branch instead? > > > > Yeah, maybe that is a safer solution. Before r307529, the scheduler seems > > to have worked okay enough... :) > > I've reverted it in r311600. Thanks, I have verified that both my simplified test cases, and the full libm regression tests now all succeed again.
Compiling for i386, thus no SSE on OpenBSD, I see that fmuls is moved over an fldcw and causes the same issue. I guess this wasn‘t fixed with the sandy bridge changes?
(In reply to Patrick Wildt from comment #22) > Compiling for i386, thus no SSE on OpenBSD, I see that fmuls is moved over > an fldcw and causes the same issue. I guess this wasn‘t fixed with the sandy > bridge changes? No the underlying problem is still there and I don't think anyone has found the root cause - I think Bug #21160 is the same but I have no proof. If you have a different testcase it'd be useful for comparison.
Sorry, to elaborate. I have an OpenBSD/i386 machine with clang 4.0, clang 5.0 and the latest clang 6.0 git checkout. All seem the have the same behaviour. With -O2, compiling sqlite3 (or a smaller C code of that), some fmul is moved over: Bad: .loc 4 35 41 # sqlite3.c:35:41 fnstcw (%esp) movw (%esp), %ax movw $3199, (%esp) # imm = 0xC7F fldcw (%esp) movw %ax, (%esp) .loc 4 35 13 # sqlite3.c:35:13 movl %ecx, %eax sarl $31, %eax .loc 4 35 61 # sqlite3.c:35:61 fmuls .LCPI0_2@GOTOFF(%ebx) .loc 4 35 41 # sqlite3.c:35:41 fistpll 16(%esp) fldcw (%esp) Good: .loc 4 35 60 # sqlite3.c:35:60 fldl 28(%ecx) .loc 4 35 61 # sqlite3.c:35:61 flds .LCPI0_2@GOTOFF(%eax) fmulp %st(1) .loc 4 35 41 # sqlite3.c:35:41 fnstcw 28(%esp) movw 28(%esp), %si movw $3199, 28(%esp) # imm = 0xC7F fldcw 28(%esp) movw %si, 28(%esp) fistpll 32(%esp) fldcw 28(%esp) movl 32(%esp), %ebx movl 36(%esp), %eax .loc 4 35 39 # sqlite3.c:35:39 This is part of -O0 vs -O1. In the good example, flds/fmulp is happening before fldcw/fistpll/flcdw. In the bad example, it becomes flcdw/fmuls/fistpll/fldcw. /var/tmp/pwildt/w-llvm/fake-i386/usr/local/bin/clang -O2 -g -Wall -v -c sqlite3.c clang version 6.0.0 (https://github.com/llvm-mirror/clang.git 0604b824fdd2e3b0c6705a4b7aabc532ebb4613e) (https://github.com/llvm-mirror/llvm.git eb585b8b48cd3cec08da5fcf5411a3b4c8ee5e33) Target: i386-unknown-openbsd6.2 Thread model: posix InstalledDir: /var/tmp/pwildt/w-llvm/fake-i386/usr/local/bin "/tmp/pwildt/w-llvm/fake-i386/usr/local/bin/clang-6.0" -cc1 -triple i386-unknown-openbsd6.2 -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name sqlite3.c -mrelocation-model pic -pic-level 1 -pic-is-pie -mthread-model posix -mdisable-fp-elim -relaxed-aliasing -masm-verbose -mconstructor-aliases -target-cpu i486 -dwarf-column-info -debug-info-kind=limited -dwarf-version=2 -debugger-tuning=gdb -v -coverage-notes-file /home/pwildt/sqlite3/sqlite3.gcno -resource-dir /tmp/pwildt/w-llvm/fake-i386/usr/local/lib/clang/6.0.0 -O2 -Wall -fdebug-compilation-dir /home/pwildt/sqlite3 -ferror-limit 19 -fmessage-length 136 -femulated-tls -fwrapv -stack-protector 2 -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup -fno-builtin-strndup -fobjc-runtime=gnustep -fdiagnostics-show-option -vectorize-loops -vectorize-slp -o sqlite3.o -x c sqlite3.c clang -cc1 version 6.0.0 based upon LLVM 6.0.0svn default target i386-unknown-openbsd6.2
Created attachment 19586 [details] Sqlite3-based Test-Case
(In reply to Simon Pilgrim from comment #23) > (In reply to Patrick Wildt from comment #22) > > Compiling for i386, thus no SSE on OpenBSD, I see that fmuls is moved over > > an fldcw and causes the same issue. I guess this wasn‘t fixed with the sandy > > bridge changes? > > No the underlying problem is still there and I don't think anyone has found > the root cause - I think Bug #21160 is the same but I have no proof. If you > have a different testcase it'd be useful for comparison. Thank you for taking the time to reply. I have attached our test case, which is an (in comparison) small part of sqlite3. The issue arises in the function computeJD(), p->JD becomes incorrect. Good output: $ TZ=GMT ./test x = 0000-00-00 00:00:0.000000 (210866760000000) p = 1970-01-01 00:00:0.000000 (210866760000000) x = 1970-01-01 00:00:0.000000 (210866760000000) t = 0 tm = { 70/01/01 00:00:00+0 } p = 1970-01-01 00:00:0.000000 (210866760000000) y = 1970-01-01 00:00:0.000000 (210866760000000) yJD = 210866760000000, xJD=210866760000000, offset = 0 Result: 1970-01-01 00:00:00 Bad output: $ TZ=GMT ./test x = 0000-00-00 00:00:0.000000 (210866760000000) p = 1970-01-01 00:00:0.000000 (210866760000000) x = 1970-01-01 00:00:0.000000 (210866760000000) t = 0 tm = { 70/01/01 00:00:00+0 } p = 1970-01-01 00:00:0.000000 (210866754551808) y = 1970-01-01 00:00:0.000000 (210866754551808) yJD = 210866754551808, xJD=210866760000000, offset = -5448192 Result: 1969-12-31 22:29:11
rL321424 should finally fix this, it marks the pseudo x87 memory folded instructions as having side effects, the same as we do for the actual x87 opcodes. This should prevent the scheduler from moving these instructions across the fldcw sequences that has been causing the rounding issues.
Yup, that fixed my test case, thank you and merry christmas! :)
Thanks everyone! This landed before 6.0 branch, so will be part of 6.0.0.