You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
// Returns one 64-bit lane of the eight-lane value held in `v`: indexes 0-3
// select from v.lo, any other index selects lane (index - 4) of v.hi.
// The index is not range-checked here; it is expected to be in [0, 7].
// The lanes are read with the GCC/Clang vector-subscript extension.
static inline int64_t Extract(const Pair& v, int index) {
return index < 4 ? v.lo[index] : v.hi[index - 4];
}
// This function gets miscompiled. This is a lot like _mm512_permutexvar_epi64.
// It takes a pair of __m256i along with an array of indexes and returns a
// permutation in __m256i. attribute((noinline)) __m256i Permute(Pair a, __m256i map) {
int64_t result[] = {0, 0, 0, 0};
_mm256_storeu_si256(reinterpret_cast<__m256i *>(result), a.lo);
result[0] = Extract(a, map[0] & 0x7);
result[1] = Extract(a, map[1] & 0x7);
result[2] = Extract(a, map[2] & 0x7);
result[3] = Extract(a, map[3] & 0x7);
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(result));
}
Extended Description
I'm seeing a miscompilation triggered by -fslp-vectorize.
Consider:
#include <stdint.h>
#include <immintrin.h>
#include <stdio.h>
// Two 256-bit integer vectors used together as eight 64-bit lanes
// (see Extract: indexes 0-3 read lo, indexes 4-7 read hi).
struct Pair {
__m256i lo;  // lanes 0-3
__m256i hi;  // lanes 4-7
};
// Returns one 64-bit lane of the eight-lane value held in `v`: indexes 0-3
// select from v.lo, any other index selects lane (index - 4) of v.hi.
// The index is not range-checked here; Permute masks it with 0x7 before the
// call. The lanes are read with the GCC/Clang vector-subscript extension.
static inline int64_t Extract(const Pair& v, int index) {
return index < 4 ? v.lo[index] : v.hi[index - 4];
}
// This function gets miscompiled. This is a lot like _mm512_permutexvar_epi64.
// It takes a pair of __m256i along with an array of indexes and returns a
// permutation in __m256i.
attribute((noinline)) __m256i Permute(Pair a, __m256i map) {
int64_t result[] = {0, 0, 0, 0};
_mm256_storeu_si256(reinterpret_cast<__m256i *>(result), a.lo);
result[0] = Extract(a, map[0] & 0x7);
result[1] = Extract(a, map[1] & 0x7);
result[2] = Extract(a, map[2] & 0x7);
result[3] = Extract(a, map[3] & 0x7);
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(result));
}
// Test driver: fills a Pair so that lanes 0-7 hold 0xa0-0xa7, permutes it with
// the lane indexes 5, 4, 3, 2 (for result lanes 0, 1, 2, 3) and prints the
// result from the highest lane to the lowest.
// A correct build prints "a2 a3 a4 a5".
int main() {
Pair v;
// _mm256_set_epi64x takes its arguments from the highest lane to the lowest,
// so lane 0 of v.lo is 0xa0 and lane 0 of v.hi is 0xa4.
v.lo = _mm256_set_epi64x(0xa3, 0xa2, 0xa1, 0xa0);
v.hi = _mm256_set_epi64x(0xa7, 0xa6, 0xa5, 0xa4);
// Index vector, again highest lane first: result lane 3 <- source lane 2,
// lane 2 <- 3, lane 1 <- 4, lane 0 <- 5.
__m256i r = Permute(v, _mm256_set_epi64x(2, 3, 4, 5));
// Each 64-bit lane is narrowed to int; the test values fit in one byte.
printf("%02x %02x %02x %02x\n",
(int)_mm256_extract_epi64(r, 3),
(int)_mm256_extract_epi64(r, 2),
(int)_mm256_extract_epi64(r, 1),
(int)_mm256_extract_epi64(r, 0));
return 0;
}
With the latest clang (a0ca4c4), I see:
$ ./release/bin/clang++ -O3 -mavx -fno-slp-vectorize permute.cc ; ./a.out
a2 a3 a4 a5
$ ./release/bin/clang++ -O3 -mavx -fslp-vectorize permute.cc ; ./a.out
a0 a3 a4 a5
Notice that the top lane is different -- 0xa0 vs. 0xa2.
Here is the assembly output for Permute:
# Compiler output (x86-64, AT&T syntax) for Permute(Pair, __m256i).
# NOTE(review): this listing appears abridged -- several block labels have no
# instructions under them and no prologue is shown -- so it is not expected to
# assemble as-is; treat it as an excerpt.
_Z7Permute4PairDv4_x: # @_Z7Permute4PairDv4_x
.cfi_startproc
%bb.0:
%bb.2:
.LBB0_1:
# Index masked to 0..3, then an 8-byte element is loaded from the stack
# region that starts at 32(%rsp).
andl $3, %edx
movq 32(%rsp,%rdx,8), %rax
.LBB0_3:
# Pattern repeated below: mask an index with 7, subtract 4, and take the jae
# branch when there was no borrow, i.e. when the masked index is >= 4.
movl %ecx, %esi
andl $7, %esi
vpextrd $2, %xmm0, %edx
subl $4, %esi
jae .LBB0_4
%bb.5:
.LBB0_4:
andl $3, %esi
movq 32(%rsp,%rsi,8), %rcx
.LBB0_6:
movl %edx, %edi
andl $7, %edi
vmovd %xmm0, %esi
subl $4, %edi
jae .LBB0_7
%bb.8:
.LBB0_10:
andl $3, %edi
movq 32(%rsp,%rdi,8), %rsi
jmp .LBB0_12
.LBB0_7:
andl $3, %edi
movq 32(%rsp,%rdi,8), %rdx
movl %esi, %edi
andl $7, %edi
subl $4, %edi
jae .LBB0_10
.LBB0_11:
# Unlike the other loads in this listing, this one is based at (%rsp)
# rather than 32(%rsp).
andl $3, %esi
movq (%rsp,%rsi,8), %rsi
.LBB0_12:
# Pack the four 64-bit values into ymm0; from the lowest lane to the highest
# the sources are rsi, rdx, rcx, rax.
vmovq %rax, %xmm0
vmovq %rcx, %xmm1
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vmovq %rdx, %xmm1
vmovq %rsi, %xmm2
vpunpcklqdq %xmm1, %xmm2, %xmm1 # xmm1 = xmm2[0],xmm1[0]
# xmm0 becomes the upper 128 bits of the result, xmm1 the lower 128 bits.
vinsertf128 $1, %xmm0, %ymm1, %ymm0
# Restore the stack pointer from rbp and return.
movq %rbp, %rsp
popq %rbp
.cfi_def_cfa %rsp, 8
retq
The text was updated successfully, but these errors were encountered: