50500 – Miscompile with SLP vectorizer + instcombine

LLVM Bugzilla is read-only and represents the historical archive of all LLVM issues filled before November 26, 2021. Use github to submit LLVM bugs

Bug 50500 - Miscompile with SLP vectorizer + instcombine

Summary: Miscompile with SLP vectorizer + instcombine

Status:	RESOLVED FIXED

Alias:	None

Product:	libraries
Classification:	Unclassified
Component:	Scalar Optimizations (show other bugs)
Version:	trunk
Hardware:	PC Linux

Importance:	P enhancement
Assignee:	Sanjay Patel

URL:
Keywords:

Depends on:
Blocks:

Reported:	2021-05-26 16:46 PDT by Jordan Rupprecht
Modified:	2021-05-30 04:24 PDT (History)
CC List:	7 users (show)

See Also:	49832
Fixed By Commit(s):	7bb8bfa0622b

Attachments
Add an attachment (proposed patch, testcase, etc.)

Note You need to log in before you can comment on or make changes to this bug.

Description Jordan Rupprecht 2021-05-26 16:46:09 PDT

D101191 in instcombine exposed a miscompile on an internal test, which goes away when disabling SLP vectorizer.

The following generates the incorrect result with "clang++ -msse4.2 -O2 repro.cc", but the correct result with "clang++ -msse4.2 -O2 repro.cc -fno-slp-vectorize":

```
#include <cstdint>
#include <cstdio>

// Needed to reproduce the issue with a small sample
template <class T>
void DoNotOptimize(const T& var) {
  asm volatile("" : "+m"(const_cast<T&>(var)));
}

struct Stats {
  int64_t x = 0;
  int64_t total = 0;
};

Stats GetStats(float a, float b) {
  float c = b ? a / b : 0.0;
  bool is_outside_a = a <= 0 || a >= 2000;
  bool is_outside_b = b <= 1 || b >= 2;
  bool is_outside_c = c <= 50 || c >= 1000;

  Stats stats;
  stats.x = is_outside_b && is_outside_c ? 1 : 0;
  stats.total = (is_outside_a || is_outside_b) ? 1 : 0;
  DoNotOptimize(stats);
  return stats;
}

int main() {
  float a = 1000.0;
  float b = 0.1;
  DoNotOptimize(a);
  DoNotOptimize(b);
  const auto stats = GetStats(a, b);
  printf("total = %ld\n", stats.total);
  return stats.total == 1 ? 0 : 1;
}
```

In this example, a is 1000 (inside the first range), b is 0.1 (outside the range), and c is 10000 (outside the range). Therefore:
- is_outside_a == false
- is_outside_b == true
- is_outside_c == true
- is_outside_b && is_outside_c == true && true == true
- is_outside_a || is_outside_b == false || true == true

However, with SLP vectorizer enabled, stats.total is 0, when it should be 1.

The following IR was reduced from the C++ above, and can reproduce the difference with the same invocations:

```
; ModuleID = 'repro.ll'
source_filename = "repro.cc"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%struct.widget = type { i64, i64 }

@global = private unnamed_addr constant [13 x i8] c"total = %ld\0A\00", align 1

define { i64, i64 } @baz(float %arg, float %arg1) local_unnamed_addr #0 {
bb:
  %tmp = alloca %struct.widget, align 8
  %tmp2 = fcmp une float %arg1, 0.000000e+00
  %tmp3 = fdiv float %arg, %arg1
  %tmp4 = select i1 %tmp2, float %tmp3, float 0.000000e+00
  %tmp5 = fcmp ole float %arg, 0.000000e+00
  %tmp6 = fcmp oge float %arg, 2.000000e+03
  %tmp7 = or i1 %tmp5, %tmp6
  %tmp8 = fcmp ole float %arg1, 1.000000e+00
  %tmp9 = fcmp oge float %arg1, 2.000000e+00
  %tmp10 = or i1 %tmp8, %tmp9
  %tmp11 = fcmp ole float %tmp4, 5.000000e+01
  %tmp12 = fcmp oge float %tmp4, 1.000000e+03
  %tmp13 = or i1 %tmp11, %tmp12
  %tmp14 = select i1 %tmp10, i1 %tmp13, i1 false
  %tmp15 = zext i1 %tmp14 to i64
  %tmp16 = getelementptr inbounds %struct.widget, %struct.widget* %tmp, i64 0, i32 0
  store i64 %tmp15, i64* %tmp16, align 8
  %tmp17 = select i1 %tmp7, i1 true, i1 %tmp10
  %tmp18 = zext i1 %tmp17 to i64
  %tmp19 = getelementptr inbounds %struct.widget, %struct.widget* %tmp, i64 0, i32 1
  store i64 %tmp18, i64* %tmp19, align 8
  call void asm sideeffect "", "=*m,*m,~{dirflag},~{fpsr},~{flags}"(%struct.widget* %tmp, %struct.widget* %tmp)
  %tmp20 = load i64, i64* %tmp19, align 8
  %tmp21 = insertvalue { i64, i64 } undef, i64 %tmp20, 1
  ret { i64, i64 } %tmp21
}

define i32 @main() local_unnamed_addr {
bb:
  %tmp = alloca float, align 4
  store float 1.000000e+03, float* %tmp, align 4
  %tmp1 = load float, float* %tmp, align 4
  %tmp2 = call { i64, i64 } @baz(float %tmp1, float undef)
  %tmp3 = extractvalue { i64, i64 } %tmp2, 1
  %tmp4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @global, i64 0, i64 0), i64 %tmp3)
  ret i32 undef
}

declare i32 @printf(i8*, ...) local_unnamed_addr

attributes #0 = { "target-features"="+cx8,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" }
```

Comment 1 Sanjay Patel 2021-05-29 07:48:09 PDT

Looks like another partial-undef vector-select bug in instcombine. I'll reduce and step through.

Comment 2 Sanjay Patel 2021-05-30 04:24:13 PDT

As with bug 49832, it's more general than partial undef - we don't account for vector transforms in our equivalence substituation logic:
https://reviews.llvm.org/rG7bb8bfa0622b

I audited callers and don't see any other places we could hit this, so I put an assert in the simplify code. 

If we want this kind of transform for vectors, we might try to handle splats only, but I don't see any evidence for it.

I think this bug was completely hidden before D101191, so it doesn't affect release branches, but feel free to reopen if that's not right.