50323 – Wrong code generated with -fslp-vectorize

LLVM Bugzilla is read-only and represents the historical archive of all LLVM issues filed before November 26, 2021. Use GitHub to submit LLVM bugs.

Bug 50323 - Wrong code generated with -fslp-vectorize

Summary: Wrong code generated with -fslp-vectorize

Status:	RESOLVED WORKSFORME

Alias:	None

Product:	libraries
Classification:	Unclassified
Component:	Scalar Optimizations (show other bugs)
Version:	trunk
Hardware:	PC Linux

Importance:	P normal
Assignee:	Unassigned LLVM Bugs

URL:
Keywords:

Depends on:
Blocks:

Reported:	2021-05-12 21:14 PDT by Kazu Hirata
Modified:	2021-05-13 14:41 PDT (History)
CC List:	6 users (show)

See Also:
Fixed By Commit(s):

Attachments
Add an attachment (proposed patch, testcase, etc.)

Note You need to log in before you can comment on or make changes to this bug.

Description Kazu Hirata 2021-05-12 21:14:52 PDT

I'm seeing a miscompilation triggered by -fslp-vectorize.

Consider:

#include <stdint.h>
#include <immintrin.h>
#include <stdio.h>

// Two 256-bit integer vectors handled as one unit. Extract() below treats
// the pair as eight 64-bit elements: `lo` supplies indexes 0-3 and `hi`
// supplies indexes 4-7.
struct Pair {
  __m256i lo;  // read by Extract() when index < 4
  __m256i hi;  // read by Extract() otherwise, as hi[index - 4]
};

// Returns the 64-bit element of `v` selected by `index`: v.lo[index] when
// index < 4, otherwise v.hi[index - 4].
//
// No range check is performed here; Permute() below masks every index with
// 0x7 before calling. The `[]` on an __m256i is the compiler's vector-type
// subscript, not an intrinsic call.
static inline int64_t Extract(const Pair& v, int index) {
  return index < 4 ? v.lo[index] : v.hi[index - 4];
}

// This function gets miscompiled.  This is a lot like _mm512_permutexvar_epi64.
// It takes a pair of __m256i along with an array of indexes and returns a
// permutation in __m256i.
//
// For each i in 0..3, output element i is Extract(a, map[i] & 0x7), i.e. one
// of the eight 64-bit elements of `a` chosen by the low three bits of map[i].
// noinline keeps this function as a separate call from main().
__attribute__((noinline)) __m256i Permute(Pair a, __m256i map) {
  int64_t result[] = {0, 0, 0, 0};
  // All four elements written by this store are overwritten by the scalar
  // assignments below, so it does not affect the returned value.
  // NOTE(review): whether this store is needed to trigger the miscompile is
  // not established here -- leave it in place.
  _mm256_storeu_si256(reinterpret_cast<__m256i *>(result), a.lo);
  result[0] = Extract(a, map[0] & 0x7);
  result[1] = Extract(a, map[1] & 0x7);
  result[2] = Extract(a, map[2] & 0x7);
  result[3] = Extract(a, map[3] & 0x7);
  // Reload the four scalars as a single vector for the return value.
  return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(result));
}

// Driver: permutes a known pattern and prints the four result elements,
// highest element first.
int main() {
  Pair v;
  // _mm256_set_epi64x takes its arguments from the highest element down to
  // the lowest, so v.lo holds 0xa0..0xa3 in elements 0..3 and v.hi holds
  // 0xa4..0xa7 in elements 0..3.
  v.lo = _mm256_set_epi64x(0xa3, 0xa2, 0xa1, 0xa0);
  v.hi = _mm256_set_epi64x(0xa7, 0xa6, 0xa5, 0xa4);
  // Index vector, again highest element first: map[3]=2, map[2]=3, map[1]=4,
  // map[0]=5.
  __m256i r = Permute(v, _mm256_set_epi64x(2, 3, 4, 5));
  // Prints r[3] r[2] r[1] r[0]; with the inputs above the expected line is
  // "a2 a3 a4 a5".
  printf("%02x %02x %02x %02x\n",
         (int)_mm256_extract_epi64(r, 3),
         (int)_mm256_extract_epi64(r, 2),
         (int)_mm256_extract_epi64(r, 1),
         (int)_mm256_extract_epi64(r, 0));
  return 0;
}

With the latest clang (a0ca4c46ca35957a38a6023fa84afda2fc9ba0ec), I see:

$ ./release/bin/clang++ -O3 -mavx -fno-slp-vectorize permute.cc ; ./a.out
a2 a3 a4 a5
$ ./release/bin/clang++ -O3 -mavx -fslp-vectorize    permute.cc ; ./a.out
a0 a3 a4 a5

Notice that the top lane is different -- 0xa0 vs. 0xa2.

Here is the assembly output for Permute:

	.text
	.file	"permute.cc"
	.globl	_Z7Permute4PairDv4_x            # -- Begin function _Z7Permute4PairDv4_x
	.p2align	4, 0x90
	.type	_Z7Permute4PairDv4_x,@function
# Reviewer annotations: comment lines only, the instructions are the listing
# exactly as posted. Register roles are read off the instructions themselves.
# The mapping to the C++ names (ymm0 = map, 16(%rbp)/48(%rbp) = a.lo/a.hi) is
# presumed from the Permute source above -- TODO confirm against the ABI.
_Z7Permute4PairDv4_x:                   # @_Z7Permute4PairDv4_x
	.cfi_startproc
# %bb.0:
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	# Align the stack pointer to 32 bytes and reserve 96 bytes.
	andq	$-32, %rsp
	subq	$96, %rsp
	# xmm1 = upper 128 bits of ymm0. eax = dword 2 of xmm1, i.e. the low
	# 32 bits of 64-bit element 3 of ymm0; edx = eax & 7.
	vextractf128	$1, %ymm0, %xmm1
	vpextrd	$2, %xmm1, %eax
	movl	%eax, %edx
	andl	$7, %edx
	# Copy the two 32-byte values at 16(%rbp) and 48(%rbp) to (%rsp) and
	# 32(%rsp).
	vmovaps	16(%rbp), %ymm2
	vmovaps	48(%rbp), %ymm3
	vmovaps	%ymm2, (%rsp)
	vmovaps	%ymm3, 32(%rsp)
	# ecx = dword 0 of xmm1: low 32 bits of 64-bit element 2 of ymm0.
	vmovd	%xmm1, %ecx
	# Element 3 select: branch when the masked index is >= 4. The taken
	# path loads from the copy at 32(%rsp), the fall-through path from the
	# copy at (%rsp). The loaded value ends up in rax.
	subl	$4, %edx
	jae	.LBB0_1
# %bb.2:
	andl	$3, %eax
	movq	(%rsp,%rax,8), %rax
	jmp	.LBB0_3
.LBB0_1:
	andl	$3, %edx
	movq	32(%rsp,%rdx,8), %rax
.LBB0_3:
	# Element 2 select: same sequence with the index in ecx, value loaded
	# into rcx. edx is reloaded with dword 2 of xmm0 (low 32 bits of
	# 64-bit element 1 of ymm0) for the next step.
	movl	%ecx, %esi
	andl	$7, %esi
	vpextrd	$2, %xmm0, %edx
	subl	$4, %esi
	jae	.LBB0_4
# %bb.5:
	andl	$3, %ecx
	movq	(%rsp,%rcx,8), %rcx
	jmp	.LBB0_6
.LBB0_4:
	andl	$3, %esi
	movq	32(%rsp,%rsi,8), %rcx
.LBB0_6:
	# Element 1 select: index in edx, value loaded into rdx. esi = dword 0
	# of xmm0 (low 32 bits of 64-bit element 0 of ymm0).
	movl	%edx, %edi
	andl	$7, %edi
	vmovd	%xmm0, %esi
	subl	$4, %edi
	jae	.LBB0_7
# %bb.8:
	andl	$3, %edx
	movq	(%rsp,%rdx,8), %rdx
	# Element 0's mask-and-compare is emitted on both element-1 paths
	# (here and in .LBB0_7); both feed .LBB0_10 / .LBB0_11.
	movl	%esi, %edi
	andl	$7, %edi
	subl	$4, %edi
	jb	.LBB0_11
.LBB0_10:
	andl	$3, %edi
	movq	32(%rsp,%rdi,8), %rsi
	jmp	.LBB0_12
.LBB0_7:
	andl	$3, %edi
	movq	32(%rsp,%rdi,8), %rdx
	movl	%esi, %edi
	andl	$7, %edi
	subl	$4, %edi
	jae	.LBB0_10
.LBB0_11:
	andl	$3, %esi
	movq	(%rsp,%rsi,8), %rsi
.LBB0_12:
	# Assemble the return value: upper 128 bits = {rcx, rax}, lower
	# 128 bits = {rsi, rdx}, so ymm0 = {rsi, rdx, rcx, rax} from element 0
	# to element 3.
	vmovq	%rax, %xmm0
	vmovq	%rcx, %xmm1
	vpunpcklqdq	%xmm0, %xmm1, %xmm0     # xmm0 = xmm1[0],xmm0[0]
	vmovq	%rdx, %xmm1
	vmovq	%rsi, %xmm2
	vpunpcklqdq	%xmm1, %xmm2, %xmm1     # xmm1 = xmm2[0],xmm1[0]
	vinsertf128	$1, %xmm0, %ymm1, %ymm0
	movq	%rbp, %rsp
	popq	%rbp
	.cfi_def_cfa %rsp, 8
	retq

Comment 1 Kazu Hirata 2021-05-12 22:51:23 PDT

This was just fixed with:

commit ab2c499d3a2ed3d3e13d96e456c57fb35a114b31
Author: Anton Afanasyev <anton.a.afanasyev@gmail.com>
Date:   Tue Mar 16 10:23:13 2021 +0300

    [SLP] Add insertelement instructions to vectorizable tree