Wrong code generated with -fslp-vectorize #49667

kazutakahirata · 2021-05-13T04:14:52Z


Bugzilla Link	50323
Resolution	WORKSFORME
Resolved on	May 13, 2021 14:41
Version	trunk
OS	Linux
CC	@topperc,@slackito,@RKSimon,@phoebewang,@rotateright

Extended Description

I'm seeing a miscompilation triggered by -fslp-vectorize.

Consider:

#include <stdint.h>
#include <immintrin.h>
#include <stdio.h>

struct Pair {
__m256i lo;
__m256i hi;
};

static inline int64_t Extract(const Pair& v, int index) {
return index < 4 ? v.lo[index] : v.hi[index - 4];
}

// This function gets miscompiled. This is a lot like _mm512_permutexvar_epi64.
// It takes a pair of __m256i along with an array of indexes and returns a
// permutation in __m256i.
attribute((noinline)) __m256i Permute(Pair a, __m256i map) {
int64_t result[] = {0, 0, 0, 0};
_mm256_storeu_si256(reinterpret_cast<__m256i *>(result), a.lo);
result[0] = Extract(a, map[0] & 0x7);
result[1] = Extract(a, map[1] & 0x7);
result[2] = Extract(a, map[2] & 0x7);
result[3] = Extract(a, map[3] & 0x7);
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(result));
}

int main() {
Pair v;
v.lo = _mm256_set_epi64x(0xa3, 0xa2, 0xa1, 0xa0);
v.hi = _mm256_set_epi64x(0xa7, 0xa6, 0xa5, 0xa4);
__m256i r = Permute(v, _mm256_set_epi64x(2, 3, 4, 5));
printf("%02x %02x %02x %02x\n",
(int)_mm256_extract_epi64(r, 3),
(int)_mm256_extract_epi64(r, 2),
(int)_mm256_extract_epi64(r, 1),
(int)_mm256_extract_epi64(r, 0));
return 0;
}

With the latest clang (a0ca4c4), I see:

$ ./release/bin/clang++ -O3 -mavx -fno-slp-vectorize permute.cc ; ./a.out
a2 a3 a4 a5
$ ./release/bin/clang++ -O3 -mavx -fslp-vectorize permute.cc ; ./a.out
a0 a3 a4 a5

Notice that the top lane is different -- 0xa0 v.s. 0xa2.

Here is the assembly output for Permute:

.text
.file	"permute.cc"
.globl	_Z7Permute4PairDv4_x            # -- Begin function _Z7Permute4PairDv4_x
.p2align	4, 0x90
.type	_Z7Permute4PairDv4_x,@function

_Z7Permute4PairDv4_x: # @_Z7Permute4PairDv4_x
.cfi_startproc

%bb.0:

pushq	%rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq	%rsp, %rbp
.cfi_def_cfa_register %rbp
andq	$-32, %rsp
subq	$96, %rsp
vextractf128	$1, %ymm0, %xmm1
vpextrd	$2, %xmm1, %eax
movl	%eax, %edx
andl	$7, %edx
vmovaps	16(%rbp), %ymm2
vmovaps	48(%rbp), %ymm3
vmovaps	%ymm2, (%rsp)
vmovaps	%ymm3, 32(%rsp)
vmovd	%xmm1, %ecx
subl	$4, %edx
jae	.LBB0_1

%bb.2:

andl	$3, %eax
movq	(%rsp,%rax,8), %rax
jmp	.LBB0_3

.LBB0_1:
andl $3, %edx
movq 32(%rsp,%rdx,8), %rax
.LBB0_3:
movl %ecx, %esi
andl $7, %esi
vpextrd $2, %xmm0, %edx
subl $4, %esi
jae .LBB0_4

%bb.5:

andl	$3, %ecx
movq	(%rsp,%rcx,8), %rcx
jmp	.LBB0_6

.LBB0_4:
andl $3, %esi
movq 32(%rsp,%rsi,8), %rcx
.LBB0_6:
movl %edx, %edi
andl $7, %edi
vmovd %xmm0, %esi
subl $4, %edi
jae .LBB0_7

%bb.8:

andl	$3, %edx
movq	(%rsp,%rdx,8), %rdx
movl	%esi, %edi
andl	$7, %edi
subl	$4, %edi
jb	.LBB0_11

.LBB0_10:
andl $3, %edi
movq 32(%rsp,%rdi,8), %rsi
jmp .LBB0_12
.LBB0_7:
andl $3, %edi
movq 32(%rsp,%rdi,8), %rdx
movl %esi, %edi
andl $7, %edi
subl $4, %edi
jae .LBB0_10
.LBB0_11:
andl $3, %esi
movq (%rsp,%rsi,8), %rsi
.LBB0_12:
vmovq %rax, %xmm0
vmovq %rcx, %xmm1
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vmovq %rdx, %xmm1
vmovq %rsi, %xmm2
vpunpcklqdq %xmm1, %xmm2, %xmm1 # xmm1 = xmm2[0],xmm1[0]
vinsertf128 $1, %xmm0, %ymm1, %ymm0
movq %rbp, %rsp
popq %rbp
.cfi_def_cfa %rsp, 8
retq

The text was updated successfully, but these errors were encountered:

kazutakahirata · 2021-05-13T05:51:23Z

This was just fixed with:

commit ab2c499
Author: Anton Afanasyev anton.a.afanasyev@gmail.com
Date: Tue Mar 16 10:23:13 2021 +0300

[SLP] Add insertelement instructions to vectorizable tree

kazutakahirata · 2021-11-27T04:23:49Z

mentioned in issue llvm/llvm-bugzilla-archive#50338

kazutakahirata · 2021-11-27T04:24:02Z

mentioned in issue llvm/llvm-bugzilla-archive#50356

llvmbot transferred this issue from llvm/llvm-bugzilla-archive Dec 11, 2021

Quuxplusone added the worksforme Resolved as "works for me" label Jan 20, 2022

This issue was closed.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Wrong code generated with -fslp-vectorize #49667

Wrong code generated with -fslp-vectorize #49667

kazutakahirata commented May 13, 2021

kazutakahirata commented May 13, 2021

kazutakahirata commented Nov 27, 2021

kazutakahirata commented Nov 27, 2021

Wrong code generated with -fslp-vectorize #49667

Wrong code generated with -fslp-vectorize #49667

Comments

kazutakahirata commented May 13, 2021

Extended Description

%bb.0:

%bb.2:

%bb.5:

%bb.8:

kazutakahirata commented May 13, 2021

kazutakahirata commented Nov 27, 2021

kazutakahirata commented Nov 27, 2021