You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
We should use broadcast and fold loads after: https://reviews.llvm.org/rG285b8abce483
...but as I mentioned in the commit message, I'm not sure if this results in a perf win even if it looks better.
Using pcmpeq to create an all-ones vector still seems like the right trade-off for most cases.
Extended Description
/* Reproducer type: a struct wrapping a single 100000-element int array. */
struct b
{
int a[100000];
};
/*
 * Reproducer: add 1, in place, to each of the first 64 elements of a->a.
 * The trip count is a compile-time constant; the compiler listings quoted
 * below handle it as eight 32-byte vector operations (offsets 0..224).
 */
void
plus1(struct b *a)
{
int i;
for (i=0;i<64;i++)
a->a[i]+=1; /* sole statement of the loop body */
}
Trunk -O3 -march=haswell:
# Clang trunk output for plus1. Comparing a register with itself via vpcmpeqd
# sets every bit, so each 32-bit lane of %ymm0 holds -1; every vpsubd below
# then computes x - (-1), i.e. x + 1, without loading a constant from memory.
plus1(b*): # @plus1(b*)
vpcmpeqd %ymm0, %ymm0, %ymm0
# The first four 32-byte vectors are loaded up front ...
vmovdqu (%rdi), %ymm1
vmovdqu 32(%rdi), %ymm2
vmovdqu 64(%rdi), %ymm3
vmovdqu 96(%rdi), %ymm4
# ... then incremented and stored back one after another.
vpsubd %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, (%rdi)
vpsubd %ymm0, %ymm2, %ymm1
vmovdqu %ymm1, 32(%rdi)
vpsubd %ymm0, %ymm3, %ymm1
vmovdqu %ymm1, 64(%rdi)
vpsubd %ymm0, %ymm4, %ymm1
vmovdqu %ymm1, 96(%rdi)
# The remaining four vectors are handled as separate load / subtract / store triples.
vmovdqu 128(%rdi), %ymm1
vpsubd %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 128(%rdi)
vmovdqu 160(%rdi), %ymm1
vpsubd %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 160(%rdi)
vmovdqu 192(%rdi), %ymm1
vpsubd %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 192(%rdi)
vmovdqu 224(%rdi), %ymm1
# The last subtract writes its result into %ymm0, overwriting the all-ones value.
vpsubd %ymm0, %ymm1, %ymm0
vmovdqu %ymm0, 224(%rdi)
vzeroupper
retq
ICC/GCC produce:
# ICC output for plus1 (the surrounding text attributes the same shape to GCC).
# One 32-byte constant is loaded from memory into %ymm7; its contents are not
# part of this listing -- presumably eight 32-bit 1s, TODO confirm.
# Each vpaddd reads one source operand directly from memory, so there are no
# separate load instructions: eight adds followed by eight stores.
plus1(b*):
vmovdqu .L_2il0floatpacket.0(%rip), %ymm7 #12.14
vpaddd (%rdi), %ymm7, %ymm0 #12.5
vpaddd 32(%rdi), %ymm7, %ymm1 #12.5
vpaddd 64(%rdi), %ymm7, %ymm2 #12.5
vpaddd 96(%rdi), %ymm7, %ymm3 #12.5
vpaddd 128(%rdi), %ymm7, %ymm4 #12.5
vpaddd 160(%rdi), %ymm7, %ymm5 #12.5
vpaddd 192(%rdi), %ymm7, %ymm6 #12.5
vpaddd 224(%rdi), %ymm7, %ymm8 #12.5
vmovdqu %ymm0, (%rdi) #12.5
vmovdqu %ymm1, 32(%rdi) #12.5
vmovdqu %ymm2, 64(%rdi) #12.5
vmovdqu %ymm3, 96(%rdi) #12.5
vmovdqu %ymm4, 128(%rdi) #12.5
vmovdqu %ymm5, 160(%rdi) #12.5
vmovdqu %ymm6, 192(%rdi) #12.5
vmovdqu %ymm8, 224(%rdi) #12.5
vzeroupper #13.1
ret #13.1
LLVM could just use:
.LCPI0_0:
.long 1 # 0x1
plus1(b*): # @plus1(b*)
vbroadcastss .LCPI0_0(%rip), %ymm0 # ymm0 = [1,1,1,1,1,1,1,1]
https://godbolt.org/z/vx4nGcor9
The text was updated successfully, but these errors were encountered: