I'm seeing a miscompilation triggered by -fslp-vectorize. Consider: #include <stdint.h> #include <immintrin.h> #include <stdio.h> struct Pair { __m256i lo; __m256i hi; }; static inline int64_t Extract(const Pair& v, int index) { return index < 4 ? v.lo[index] : v.hi[index - 4]; } // This function gets miscompiled. This is a lot like _mm512_permutexvar_epi64. // It takes a pair of __m256i along with an array of indexes and returns a // permutation in __m256i. __attribute__((noinline)) __m256i Permute(Pair a, __m256i map) { int64_t result[] = {0, 0, 0, 0}; _mm256_storeu_si256(reinterpret_cast<__m256i *>(result), a.lo); result[0] = Extract(a, map[0] & 0x7); result[1] = Extract(a, map[1] & 0x7); result[2] = Extract(a, map[2] & 0x7); result[3] = Extract(a, map[3] & 0x7); return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(result)); } int main() { Pair v; v.lo = _mm256_set_epi64x(0xa3, 0xa2, 0xa1, 0xa0); v.hi = _mm256_set_epi64x(0xa7, 0xa6, 0xa5, 0xa4); __m256i r = Permute(v, _mm256_set_epi64x(2, 3, 4, 5)); printf("%02x %02x %02x %02x\n", (int)_mm256_extract_epi64(r, 3), (int)_mm256_extract_epi64(r, 2), (int)_mm256_extract_epi64(r, 1), (int)_mm256_extract_epi64(r, 0)); return 0; } With the latest clang (a0ca4c46ca35957a38a6023fa84afda2fc9ba0ec), I see: $ ./release/bin/clang++ -O3 -mavx -fno-slp-vectorize permute.cc ; ./a.out a2 a3 a4 a5 $ ./release/bin/clang++ -O3 -mavx -fslp-vectorize permute.cc ; ./a.out a0 a3 a4 a5 Notice that the top lane is different -- 0xa0 v.s. 0xa2. Here is the assembly output for Permute: .text .file "permute.cc" .globl _Z7Permute4PairDv4_x # -- Begin function _Z7Permute4PairDv4_x .p2align 4, 0x90 .type _Z7Permute4PairDv4_x,@function _Z7Permute4PairDv4_x: # @_Z7Permute4PairDv4_x .cfi_startproc # %bb.0: pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset %rbp, -16 movq %rsp, %rbp .cfi_def_cfa_register %rbp andq $-32, %rsp subq $96, %rsp vextractf128 $1, %ymm0, %xmm1 vpextrd $2, %xmm1, %eax movl %eax, %edx andl $7, %edx vmovaps 16(%rbp), %ymm2 vmovaps 48(%rbp), %ymm3 vmovaps %ymm2, (%rsp) vmovaps %ymm3, 32(%rsp) vmovd %xmm1, %ecx subl $4, %edx jae .LBB0_1 # %bb.2: andl $3, %eax movq (%rsp,%rax,8), %rax jmp .LBB0_3 .LBB0_1: andl $3, %edx movq 32(%rsp,%rdx,8), %rax .LBB0_3: movl %ecx, %esi andl $7, %esi vpextrd $2, %xmm0, %edx subl $4, %esi jae .LBB0_4 # %bb.5: andl $3, %ecx movq (%rsp,%rcx,8), %rcx jmp .LBB0_6 .LBB0_4: andl $3, %esi movq 32(%rsp,%rsi,8), %rcx .LBB0_6: movl %edx, %edi andl $7, %edi vmovd %xmm0, %esi subl $4, %edi jae .LBB0_7 # %bb.8: andl $3, %edx movq (%rsp,%rdx,8), %rdx movl %esi, %edi andl $7, %edi subl $4, %edi jb .LBB0_11 .LBB0_10: andl $3, %edi movq 32(%rsp,%rdi,8), %rsi jmp .LBB0_12 .LBB0_7: andl $3, %edi movq 32(%rsp,%rdi,8), %rdx movl %esi, %edi andl $7, %edi subl $4, %edi jae .LBB0_10 .LBB0_11: andl $3, %esi movq (%rsp,%rsi,8), %rsi .LBB0_12: vmovq %rax, %xmm0 vmovq %rcx, %xmm1 vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0] vmovq %rdx, %xmm1 vmovq %rsi, %xmm2 vpunpcklqdq %xmm1, %xmm2, %xmm1 # xmm1 = xmm2[0],xmm1[0] vinsertf128 $1, %xmm0, %ymm1, %ymm0 movq %rbp, %rsp popq %rbp .cfi_def_cfa %rsp, 8 retq
This was just fixed with: commit ab2c499d3a2ed3d3e13d96e456c57fb35a114b31 Author: Anton Afanasyev <anton.a.afanasyev@gmail.com> Date: Tue Mar 16 10:23:13 2021 +0300 [SLP] Add insertelement instructions to vectorizable tree