Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wrong code generated with -fslp-vectorize #49667

Closed
kazutakahirata opened this issue May 13, 2021 · 3 comments
Closed

Wrong code generated with -fslp-vectorize #49667

kazutakahirata opened this issue May 13, 2021 · 3 comments
Labels
bugzilla Issues migrated from bugzilla worksforme Resolved as "works for me"

Comments

@kazutakahirata
Copy link
Contributor

Bugzilla Link 50323
Resolution WORKSFORME
Resolved on May 13, 2021 14:41
Version trunk
OS Linux
CC @topperc,@slackito,@RKSimon,@phoebewang,@rotateright

Extended Description

I'm seeing a miscompilation triggered by -fslp-vectorize.

Consider:

#include <stdint.h>
#include <immintrin.h>
#include <stdio.h>

struct Pair {
__m256i lo;
__m256i hi;
};

static inline int64_t Extract(const Pair& v, int index) {
return index < 4 ? v.lo[index] : v.hi[index - 4];
}

// This function gets miscompiled. This is a lot like _mm512_permutexvar_epi64.
// It takes a pair of __m256i along with an array of indexes and returns a
// permutation in __m256i.
attribute((noinline)) __m256i Permute(Pair a, __m256i map) {
int64_t result[] = {0, 0, 0, 0};
_mm256_storeu_si256(reinterpret_cast<__m256i *>(result), a.lo);
result[0] = Extract(a, map[0] & 0x7);
result[1] = Extract(a, map[1] & 0x7);
result[2] = Extract(a, map[2] & 0x7);
result[3] = Extract(a, map[3] & 0x7);
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(result));
}

int main() {
Pair v;
v.lo = _mm256_set_epi64x(0xa3, 0xa2, 0xa1, 0xa0);
v.hi = _mm256_set_epi64x(0xa7, 0xa6, 0xa5, 0xa4);
__m256i r = Permute(v, _mm256_set_epi64x(2, 3, 4, 5));
printf("%02x %02x %02x %02x\n",
(int)_mm256_extract_epi64(r, 3),
(int)_mm256_extract_epi64(r, 2),
(int)_mm256_extract_epi64(r, 1),
(int)_mm256_extract_epi64(r, 0));
return 0;
}

With the latest clang (a0ca4c4), I see:

$ ./release/bin/clang++ -O3 -mavx -fno-slp-vectorize permute.cc ; ./a.out
a2 a3 a4 a5
$ ./release/bin/clang++ -O3 -mavx -fslp-vectorize permute.cc ; ./a.out
a0 a3 a4 a5

Notice that the top lane is different -- 0xa0 v.s. 0xa2.

Here is the assembly output for Permute:

.text
.file	"permute.cc"
.globl	_Z7Permute4PairDv4_x            # -- Begin function _Z7Permute4PairDv4_x
.p2align	4, 0x90
.type	_Z7Permute4PairDv4_x,@function

_Z7Permute4PairDv4_x: # @​_Z7Permute4PairDv4_x
.cfi_startproc

%bb.0:

pushq	%rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq	%rsp, %rbp
.cfi_def_cfa_register %rbp
andq	$-32, %rsp
subq	$96, %rsp
vextractf128	$1, %ymm0, %xmm1
vpextrd	$2, %xmm1, %eax
movl	%eax, %edx
andl	$7, %edx
vmovaps	16(%rbp), %ymm2
vmovaps	48(%rbp), %ymm3
vmovaps	%ymm2, (%rsp)
vmovaps	%ymm3, 32(%rsp)
vmovd	%xmm1, %ecx
subl	$4, %edx
jae	.LBB0_1

%bb.2:

andl	$3, %eax
movq	(%rsp,%rax,8), %rax
jmp	.LBB0_3

.LBB0_1:
andl $3, %edx
movq 32(%rsp,%rdx,8), %rax
.LBB0_3:
movl %ecx, %esi
andl $7, %esi
vpextrd $2, %xmm0, %edx
subl $4, %esi
jae .LBB0_4

%bb.5:

andl	$3, %ecx
movq	(%rsp,%rcx,8), %rcx
jmp	.LBB0_6

.LBB0_4:
andl $3, %esi
movq 32(%rsp,%rsi,8), %rcx
.LBB0_6:
movl %edx, %edi
andl $7, %edi
vmovd %xmm0, %esi
subl $4, %edi
jae .LBB0_7

%bb.8:

andl	$3, %edx
movq	(%rsp,%rdx,8), %rdx
movl	%esi, %edi
andl	$7, %edi
subl	$4, %edi
jb	.LBB0_11

.LBB0_10:
andl $3, %edi
movq 32(%rsp,%rdi,8), %rsi
jmp .LBB0_12
.LBB0_7:
andl $3, %edi
movq 32(%rsp,%rdi,8), %rdx
movl %esi, %edi
andl $7, %edi
subl $4, %edi
jae .LBB0_10
.LBB0_11:
andl $3, %esi
movq (%rsp,%rsi,8), %rsi
.LBB0_12:
vmovq %rax, %xmm0
vmovq %rcx, %xmm1
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vmovq %rdx, %xmm1
vmovq %rsi, %xmm2
vpunpcklqdq %xmm1, %xmm2, %xmm1 # xmm1 = xmm2[0],xmm1[0]
vinsertf128 $1, %xmm0, %ymm1, %ymm0
movq %rbp, %rsp
popq %rbp
.cfi_def_cfa %rsp, 8
retq

@kazutakahirata
Copy link
Contributor Author

This was just fixed with:

commit ab2c499
Author: Anton Afanasyev anton.a.afanasyev@gmail.com
Date: Tue Mar 16 10:23:13 2021 +0300

[SLP] Add insertelement instructions to vectorizable tree

@kazutakahirata
Copy link
Contributor Author

mentioned in issue llvm/llvm-bugzilla-archive#50338

@kazutakahirata
Copy link
Contributor Author

mentioned in issue llvm/llvm-bugzilla-archive#50356

@llvmbot llvmbot transferred this issue from llvm/llvm-bugzilla-archive Dec 11, 2021
@Quuxplusone Quuxplusone added the worksforme Resolved as "works for me" label Jan 20, 2022
This issue was closed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bugzilla Issues migrated from bugzilla worksforme Resolved as "works for me"
Projects
None yet
Development

No branches or pull requests

2 participants