手动展开初始化的分块(tiled)计算与无初始化的简单循环:自动向量化对比
在优化 AABB 碰撞检测算法最内层的 4 对 4(4-versus-4)比较部分时,我遇到了困难:如何在简化代码的同时提升(或至少保持)性能。
这是采用手动展开初始化的版本:
https://godbolt.org/godbolt.org/z/tmgmhdsssssssss
// Tests whether the 1-D ranges [minx, maxx] and [minx2, maxx2] overlap.
// Ranges that merely touch at an endpoint count as overlapping.
// Returns 1 on overlap and 0 otherwise.
// The test is phrased as "neither range lies strictly before the other", so a
// comparison that evaluates to false (e.g. one involving NaN) is treated as
// "not separated".
inline
const int intersectDim(const float minx, const float maxx, const float minx2, const float maxx2) noexcept
{
    const bool firstEndsBeforeSecond = maxx < minx2;
    const bool secondEndsBeforeFirst = maxx2 < minx;
    return !firstEndsBeforeSecond && !secondEndsBeforeFirst;
}
// Compares every box of a first group of 4 against every box of a second group
// of 4 and writes 16 flags (0 or 1) to `out`.
//
// Inputs: each pointer is read at indices 0..3 only.
//   partId1 / partId2       - integer ids of the boxes in group 1 / group 2
//   min{x,y,z}1, max{x,y,z}1 - per-axis bounds of the boxes in group 1
//   min{x,y,z}2, max{x,y,z}2 - per-axis bounds of the boxes in group 2
// Output: out[0..15]. Slot s pairs group-1 box (s % 4) with group-2 box (s / 4).
//   out[s] is 1 when the group-1 id is strictly less than the group-2 id AND
//   intersectDim() reports an overlap on x, y and z; otherwise it is 0.
//
// All operands are first expanded by hand into 16-wide, 32-byte-aligned
// scratch arrays so that the comparison loops below index every array with the
// plain loop counter.
inline
void comp4vs4( const int * const __restrict__ partId1, const int * const __restrict__ partId2,
const float * const __restrict__ minx1, const float * const __restrict__ minx2,
const float * const __restrict__ miny1, const float * const __restrict__ miny2,
const float * const __restrict__ minz1, const float * const __restrict__ minz2,
const float * const __restrict__ maxx1, const float * const __restrict__ maxx2,
const float * const __restrict__ maxy1, const float * const __restrict__ maxy2,
const float * const __restrict__ maxz1, const float * const __restrict__ maxz2,
int * const __restrict__ out
)
{
    // Per-slot verdicts; starts out all zero.
    alignas(32) int pairHit[16] = {};

    // "A" arrays repeat group 1 with period 4:        0,1,2,3, 0,1,2,3, ...
    // "B" arrays hold each group-2 value for 4 slots: 0,0,0,0, 1,1,1,1, ...
    alignas(32) int idA[16] = {
        partId1[0], partId1[1], partId1[2], partId1[3],
        partId1[0], partId1[1], partId1[2], partId1[3],
        partId1[0], partId1[1], partId1[2], partId1[3],
        partId1[0], partId1[1], partId1[2], partId1[3]
    };
    alignas(32) int idB[16] = {
        partId2[0], partId2[0], partId2[0], partId2[0],
        partId2[1], partId2[1], partId2[1], partId2[1],
        partId2[2], partId2[2], partId2[2], partId2[2],
        partId2[3], partId2[3], partId2[3], partId2[3]
    };

    // Lower bounds, x axis.
    alignas(32) float loXA[16] = {
        minx1[0], minx1[1], minx1[2], minx1[3],
        minx1[0], minx1[1], minx1[2], minx1[3],
        minx1[0], minx1[1], minx1[2], minx1[3],
        minx1[0], minx1[1], minx1[2], minx1[3]
    };
    alignas(32) float loXB[16] = {
        minx2[0], minx2[0], minx2[0], minx2[0],
        minx2[1], minx2[1], minx2[1], minx2[1],
        minx2[2], minx2[2], minx2[2], minx2[2],
        minx2[3], minx2[3], minx2[3], minx2[3]
    };

    // Lower bounds, y axis.
    alignas(32) float loYA[16] = {
        miny1[0], miny1[1], miny1[2], miny1[3],
        miny1[0], miny1[1], miny1[2], miny1[3],
        miny1[0], miny1[1], miny1[2], miny1[3],
        miny1[0], miny1[1], miny1[2], miny1[3]
    };
    alignas(32) float loYB[16] = {
        miny2[0], miny2[0], miny2[0], miny2[0],
        miny2[1], miny2[1], miny2[1], miny2[1],
        miny2[2], miny2[2], miny2[2], miny2[2],
        miny2[3], miny2[3], miny2[3], miny2[3]
    };

    // Lower bounds, z axis.
    alignas(32) float loZA[16] = {
        minz1[0], minz1[1], minz1[2], minz1[3],
        minz1[0], minz1[1], minz1[2], minz1[3],
        minz1[0], minz1[1], minz1[2], minz1[3],
        minz1[0], minz1[1], minz1[2], minz1[3]
    };
    alignas(32) float loZB[16] = {
        minz2[0], minz2[0], minz2[0], minz2[0],
        minz2[1], minz2[1], minz2[1], minz2[1],
        minz2[2], minz2[2], minz2[2], minz2[2],
        minz2[3], minz2[3], minz2[3], minz2[3]
    };

    // Upper bounds, x axis.
    alignas(32) float hiXA[16] = {
        maxx1[0], maxx1[1], maxx1[2], maxx1[3],
        maxx1[0], maxx1[1], maxx1[2], maxx1[3],
        maxx1[0], maxx1[1], maxx1[2], maxx1[3],
        maxx1[0], maxx1[1], maxx1[2], maxx1[3]
    };
    alignas(32) float hiXB[16] = {
        maxx2[0], maxx2[0], maxx2[0], maxx2[0],
        maxx2[1], maxx2[1], maxx2[1], maxx2[1],
        maxx2[2], maxx2[2], maxx2[2], maxx2[2],
        maxx2[3], maxx2[3], maxx2[3], maxx2[3]
    };

    // Upper bounds, y axis.
    alignas(32) float hiYA[16] = {
        maxy1[0], maxy1[1], maxy1[2], maxy1[3],
        maxy1[0], maxy1[1], maxy1[2], maxy1[3],
        maxy1[0], maxy1[1], maxy1[2], maxy1[3],
        maxy1[0], maxy1[1], maxy1[2], maxy1[3]
    };
    alignas(32) float hiYB[16] = {
        maxy2[0], maxy2[0], maxy2[0], maxy2[0],
        maxy2[1], maxy2[1], maxy2[1], maxy2[1],
        maxy2[2], maxy2[2], maxy2[2], maxy2[2],
        maxy2[3], maxy2[3], maxy2[3], maxy2[3]
    };

    // Upper bounds, z axis.
    alignas(32) float hiZA[16] = {
        maxz1[0], maxz1[1], maxz1[2], maxz1[3],
        maxz1[0], maxz1[1], maxz1[2], maxz1[3],
        maxz1[0], maxz1[1], maxz1[2], maxz1[3],
        maxz1[0], maxz1[1], maxz1[2], maxz1[3]
    };
    alignas(32) float hiZB[16] = {
        maxz2[0], maxz2[0], maxz2[0], maxz2[0],
        maxz2[1], maxz2[1], maxz2[1], maxz2[1],
        maxz2[2], maxz2[2], maxz2[2], maxz2[2],
        maxz2[3], maxz2[3], maxz2[3], maxz2[3]
    };

    // Stage 1: keep only the pairs whose group-1 id is below the group-2 id.
    for (int slot = 0; slot < 16; ++slot)
    {
        pairHit[slot] = (idA[slot] < idB[slot]);
    }

    // Stage 2: a surviving pair must also overlap on all three axes.
    for (int slot = 0; slot < 16; ++slot)
    {
        pairHit[slot] = pairHit[slot]
            && intersectDim(loXA[slot], hiXA[slot], loXB[slot], hiXB[slot])
            && intersectDim(loYA[slot], hiYA[slot], loYB[slot], hiYB[slot])
            && intersectDim(loZA[slot], hiZA[slot], loZB[slot], hiZB[slot]);
    }

    // Stage 3: publish the verdicts to the caller's buffer.
    for (int slot = 0; slot < 16; ++slot)
    {
        out[slot] = pairHit[slot];
    }
}
#include<iostream>
// Driver program: reads the data for two groups of four boxes from standard
// input, runs comp4vs4 on them and prints the 16 resulting flags to standard
// output back to back (no separators, no trailing newline).
//
// Input order: for each box index 0..3, fourteen whitespace-separated values
// are read, in the same order as comp4vs4's first fourteen parameters
// (two ints followed by twelve floats).
int main()
{
    // Every buffer is value-initialized. If the input stream is short or
    // malformed, the extractions below stop storing values, and the arrays
    // would otherwise be read while still holding indeterminate contents.
    int idsA[4] = {};
    int idsB[4] = {};
    float minXA[4] = {};
    float minXB[4] = {};
    float minYA[4] = {};
    float minYB[4] = {};
    float minZA[4] = {};
    float minZB[4] = {};
    float maxXA[4] = {};
    float maxXB[4] = {};
    float maxYA[4] = {};
    float maxYB[4] = {};
    float maxZA[4] = {};
    float maxZB[4] = {};

    for (int box = 0; box < 4; ++box)
    {
        std::cin >> idsA[box] >> idsB[box]
                 >> minXA[box] >> minXB[box]
                 >> minYA[box] >> minYB[box]
                 >> minZA[box] >> minZB[box]
                 >> maxXA[box] >> maxXB[box]
                 >> maxYA[box] >> maxYB[box]
                 >> maxZA[box] >> maxZB[box];
    }

    int out[16] = {};
    comp4vs4(idsA, idsB,
             minXA, minXB, minYA, minYB, minZA, minZB,
             maxXA, maxXB, maxYA, maxYB, maxZA, maxZB,
             out);

    for (int slot = 0; slot < 16; ++slot)
    {
        std::cout << out[slot];
    }
    return 0;
}
Godbolt 为它生成的输出如下:
comp4vs4(int const*, int const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, int*):
push rbp
mov rbp, rsp
and rsp, -32
sub rsp, 8
mov rax, QWORD PTR [rbp+80]
vmovups xmm0, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+16]
vmovups xmm6, XMMWORD PTR [rcx]
vmovups xmm5, XMMWORD PTR [r9]
vmovups xmm9, XMMWORD PTR [r8]
vmovdqu xmm15, XMMWORD PTR [rsi]
vmovdqu xmm8, XMMWORD PTR [rdi]
vmovups xmm2, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+24]
vpermilps xmm1, xmm6, 0
vmovdqa XMMWORD PTR [rsp-88], xmm15
vmovups xmm4, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+32]
vmovups xmm14, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+40]
vmovups xmm11, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+48]
vcmpleps xmm3, xmm1, xmm14
vmovups xmm13, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+56]
vpermilps xmm1, xmm11, 0
vcmpleps xmm1, xmm0, xmm1
vmovups xmm10, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+64]
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm5, 0
vcmpleps xmm3, xmm3, xmm13
vmovups xmm7, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+72]
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm10, 0
vmovaps XMMWORD PTR [rsp-72], xmm7
vcmpleps xmm3, xmm9, xmm3
vmovups xmm7, XMMWORD PTR [rdx]
vpand xmm1, xmm1, xmm3
vpshufd xmm3, xmm15, 0
vpcomltd xmm3, xmm8, xmm3
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm7, 0
vcmpleps xmm12, xmm2, xmm3
vpermilps xmm3, xmm4, 0
vcmpleps xmm3, xmm3, XMMWORD PTR [rsp-72]
vpand xmm3, xmm3, xmm12
vmovdqa xmm12, XMMWORD PTR .LC0[rip]
vpand xmm3, xmm3, xmm12
vpand xmm1, xmm1, xmm3
vmovdqa XMMWORD PTR [rsp-104], xmm1
vpermilps xmm1, xmm6, 85
vcmpleps xmm3, xmm1, xmm14
vpermilps xmm1, xmm11, 85
vcmpleps xmm1, xmm0, xmm1
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm5, 85
vcmpleps xmm3, xmm3, xmm13
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm10, 85
vcmpleps xmm3, xmm9, xmm3
vpand xmm1, xmm1, xmm3
vpshufd xmm3, xmm15, 85
vpermilps xmm15, xmm4, 85
vpcomltd xmm3, xmm8, xmm3
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm7, 85
vcmpleps xmm15, xmm15, XMMWORD PTR [rsp-72]
vcmpleps xmm3, xmm2, xmm3
vpand xmm3, xmm3, xmm15
vpermilps xmm15, xmm4, 170
vpand xmm3, xmm3, xmm12
vpermilps xmm4, xmm4, 255
vcmpleps xmm15, xmm15, XMMWORD PTR [rsp-72]
vpand xmm1, xmm1, xmm3
vcmpleps xmm4, xmm4, XMMWORD PTR [rsp-72]
vmovdqa XMMWORD PTR [rsp-120], xmm1
vpermilps xmm1, xmm6, 170
vpermilps xmm6, xmm6, 255
vcmpleps xmm3, xmm1, xmm14
vpermilps xmm1, xmm11, 170
vpermilps xmm11, xmm11, 255
vcmpleps xmm6, xmm6, xmm14
vcmpleps xmm1, xmm0, xmm1
vcmpleps xmm11, xmm0, xmm11
vpshufd xmm0, XMMWORD PTR [rsp-88], 255
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm5, 170
vpermilps xmm5, xmm5, 255
vcmpleps xmm3, xmm3, xmm13
vpand xmm6, xmm11, xmm6
vcmpleps xmm13, xmm5, xmm13
vmovdqa xmm5, XMMWORD PTR [rsp-104]
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm10, 170
vpermilps xmm10, xmm10, 255
vcmpleps xmm3, xmm9, xmm3
vpand xmm6, xmm6, xmm13
vmovdqu XMMWORD PTR [rax], xmm5
vcmpleps xmm9, xmm9, xmm10
vpand xmm1, xmm1, xmm3
vpshufd xmm3, XMMWORD PTR [rsp-88], 170
vpand xmm9, xmm6, xmm9
vpcomltd xmm3, xmm8, xmm3
vpand xmm1, xmm1, xmm3
vpcomltd xmm8, xmm8, xmm0
vmovdqa xmm0, XMMWORD PTR [rsp-120]
vpermilps xmm3, xmm7, 170
vpermilps xmm7, xmm7, 255
vcmpleps xmm3, xmm2, xmm3
vpand xmm8, xmm9, xmm8
vcmpleps xmm2, xmm2, xmm7
vmovdqu XMMWORD PTR [rax+16], xmm0
vpand xmm3, xmm3, xmm15
vpand xmm2, xmm2, xmm4
vpand xmm3, xmm3, xmm12
vpand xmm12, xmm2, xmm12
vpand xmm3, xmm1, xmm3
vpand xmm12, xmm8, xmm12
vmovdqu XMMWORD PTR [rax+32], xmm3
vmovdqu XMMWORD PTR [rax+48], xmm12
leave
ret
main: // character limit 30k
约 123 行向量指令。由于它的性能还算不错,我尝试用简单的位运算来简化它:
// Tests whether the 1-D ranges [minx, maxx] and [minx2, maxx2] overlap.
// Ranges that merely touch at an endpoint count as overlapping.
// Returns 1 on overlap and 0 otherwise.
// The test is phrased as "neither range lies strictly before the other", so a
// comparison that evaluates to false (e.g. one involving NaN) is treated as
// "not separated".
inline
const int intersectDim(const float minx, const float maxx, const float minx2, const float maxx2) noexcept
{
    const bool firstEndsBeforeSecond = maxx < minx2;
    const bool secondEndsBeforeFirst = maxx2 < minx;
    return !firstEndsBeforeSecond && !secondEndsBeforeFirst;
}
// Compares every box of a first group of 4 against every box of a second group
// of 4 and writes 16 flags (0 or 1) to `out`.
//
// Inputs: each pointer is read at indices 0..3 only.
//   partId1 / partId2       - integer ids of the boxes in group 1 / group 2
//   min{x,y,z}1, max{x,y,z}1 - per-axis bounds of the boxes in group 1
//   min{x,y,z}2, max{x,y,z}2 - per-axis bounds of the boxes in group 2
// Output: out[0..15]. Slot s pairs group-1 box (s & 3) with group-2 box (s / 4).
//   out[s] is 1 when the group-1 id is strictly less than the group-2 id AND
//   intersectDim() reports an overlap on x, y and z; otherwise it is 0.
//
// This variant derives both box indices from the slot number with a bitwise
// AND and an integer division instead of pre-expanding the inputs.
inline
void comp4vs4( const int * const __restrict__ partId1, const int * const __restrict__ partId2,
const float * const __restrict__ minx1, const float * const __restrict__ minx2,
const float * const __restrict__ miny1, const float * const __restrict__ miny2,
const float * const __restrict__ minz1, const float * const __restrict__ minz2,
const float * const __restrict__ maxx1, const float * const __restrict__ maxx2,
const float * const __restrict__ maxy1, const float * const __restrict__ maxy2,
const float * const __restrict__ maxz1, const float * const __restrict__ maxz2,
int * const __restrict__ out
)
{
    // Per-slot verdicts; starts out all zero.
    alignas(32) int pairHit[16] = {};

    // Stage 1: keep only the pairs whose group-1 id is below the group-2 id.
    for (int slot = 0; slot < 16; ++slot)
    {
        const int a = slot & 3;   // group-1 box: cycles 0,1,2,3
        const int b = slot / 4;   // group-2 box: advances every 4 slots
        pairHit[slot] = partId1[a] < partId2[b];
    }

    // Stage 2: a surviving pair must also overlap on all three axes.
    for (int slot = 0; slot < 16; ++slot)
    {
        const int a = slot & 3;
        const int b = slot / 4;
        pairHit[slot] = pairHit[slot]
            && intersectDim(minx1[a], maxx1[a], minx2[b], maxx2[b])
            && intersectDim(miny1[a], maxy1[a], miny2[b], maxy2[b])
            && intersectDim(minz1[a], maxz1[a], minz2[b], maxz2[b]);
    }

    // Stage 3: publish the verdicts to the caller's buffer.
    for (int slot = 0; slot < 16; ++slot)
    {
        out[slot] = pairHit[slot];
    }
}
#include<iostream>
// Driver program: reads the data for two groups of four boxes from standard
// input, runs comp4vs4 on them and prints the 16 resulting flags to standard
// output back to back (no separators, no trailing newline).
//
// Input order: for each box index 0..3, fourteen whitespace-separated values
// are read, in the same order as comp4vs4's first fourteen parameters
// (two ints followed by twelve floats).
int main()
{
    // Every buffer is value-initialized. If the input stream is short or
    // malformed, the extractions below stop storing values, and the arrays
    // would otherwise be read while still holding indeterminate contents.
    int idsA[4] = {};
    int idsB[4] = {};
    float minXA[4] = {};
    float minXB[4] = {};
    float minYA[4] = {};
    float minYB[4] = {};
    float minZA[4] = {};
    float minZB[4] = {};
    float maxXA[4] = {};
    float maxXB[4] = {};
    float maxYA[4] = {};
    float maxYB[4] = {};
    float maxZA[4] = {};
    float maxZB[4] = {};

    for (int box = 0; box < 4; ++box)
    {
        std::cin >> idsA[box] >> idsB[box]
                 >> minXA[box] >> minXB[box]
                 >> minYA[box] >> minYB[box]
                 >> minZA[box] >> minZB[box]
                 >> maxXA[box] >> maxXB[box]
                 >> maxYA[box] >> maxYB[box]
                 >> maxZA[box] >> maxZB[box];
    }

    int out[16] = {};
    comp4vs4(idsA, idsB,
             minXA, minXB, minYA, minYB, minZA, minZB,
             maxXA, maxXB, maxYA, maxYB, maxZA, maxZB,
             out);

    for (int slot = 0; slot < 16; ++slot)
    {
        std::cout << out[slot];
    }
    return 0;
}
Godbolt 的输出如下:
main:
// character limit 30k
vpxor xmm0, xmm0, xmm0
vmovdqa xmm3, XMMWORD PTR .LC0[rip]
lea rax, [rsp+240]
vpxor xmm4, xmm4, xmm4
vmovdqa XMMWORD PTR [rsp+224], xmm0
vmovdqa XMMWORD PTR [rsp+240], xmm0
vmovdqa XMMWORD PTR [rsp+256], xmm0
vmovdqa XMMWORD PTR [rsp+272], xmm0
vpcmpeqd xmm0, xmm0, xmm0
vmovdqa xmm7, xmm0
vmovdqa xmm6, xmm0
vmovdqa xmm5, xmm0
vpgatherdd xmm2, DWORD PTR [rsp+16+xmm4*4], xmm7
vmovdqa xmm4, XMMWORD PTR .LC1[rip]
vpgatherdd xmm1, DWORD PTR [rdx+xmm3*4], xmm6
vmovdqa xmm7, xmm0
vmovdqa xmm6, xmm0
vpcomltd xmm1, xmm1, xmm2
vpand xmm1, xmm1, xmm4
vmovdqa XMMWORD PTR [rsp+224], xmm1
vpgatherdd xmm1, DWORD PTR [rdx+xmm3*4], xmm6
vpgatherdd xmm2, DWORD PTR [rsp+16+xmm4*4], xmm7
vmovdqa xmm6, xmm0
vmovdqa xmm7, xmm0
vpcomltd xmm1, xmm1, xmm2
vpand xmm1, xmm1, xmm4
vmovdqa XMMWORD PTR [rsp+240], xmm1
vpgatherdd xmm1, DWORD PTR [rdx+xmm3*4], xmm5
vmovdqa xmm5, XMMWORD PTR .LC2[rip]
vpgatherdd xmm2, DWORD PTR [rsp+16+xmm5*4], xmm6
vmovdqa xmm5, XMMWORD PTR .LC3[rip]
vmovdqa xmm6, xmm0
vpcomltd xmm1, xmm1, xmm2
vpand xmm1, xmm1, xmm4
vmovdqa XMMWORD PTR [rsp+256], xmm1
vpgatherdd xmm1, DWORD PTR [rdx+xmm3*4], xmm7
vpgatherdd xmm0, DWORD PTR [rsp+16+xmm5*4], xmm6
vmovdqa xmm7, XMMWORD PTR .LC4[rip]
vpxor xmm6, xmm6, xmm6
lea rdx, [rsp+304]
vpcomltd xmm0, xmm1, xmm0
vpand xmm0, xmm0, xmm4
vmovdqa XMMWORD PTR [rsp+272], xmm0
.L3:
vmovdqa xmm0, XMMWORD PTR [rax-16]
vmovdqa xmm2, xmm3
prefetcht0 [rax]
add rax, 16
vpaddd xmm3, xmm3, xmm7
vpsrad xmm8, xmm2, 2
vpand xmm2, xmm2, xmm5
vpcomneqd xmm1, xmm0, xmm6
vmovaps xmm0, xmm1
vmovaps xmm11, xmm1
vmovaps xmm12, xmm1
vmovaps xmm13, xmm1
vgatherdps xmm11, DWORD PTR [rsp+144+xmm8*4], xmm0
vmovaps xmm14, xmm1
vmovaps xmm0, xmm1
vmovaps xmm10, xmm1
vmovaps xmm9, xmm1
vgatherdps xmm10, DWORD PTR [rsp+128+xmm2*4], xmm13
vgatherdps xmm0, DWORD PTR [r13+0+xmm8*4], xmm12
vgatherdps xmm9, DWORD PTR [rsp+32+xmm2*4], xmm14
vcmpleps xmm0, xmm0, xmm10
vcmpleps xmm9, xmm9, xmm11
vpand xmm0, xmm0, xmm9
vpand xmm1, xmm0, xmm1
vmovaps xmm0, xmm1
vmovaps xmm11, xmm1
vmovaps xmm15, xmm1
vmovaps xmm10, xmm1
vgatherdps xmm11, DWORD PTR [r15+xmm8*4], xmm0
vmovaps xmm12, xmm1
vmovaps xmm0, xmm1
vmovaps xmm9, xmm1
vmovaps xmm13, xmm1
vgatherdps xmm10, DWORD PTR [r12+xmm2*4], xmm12
vgatherdps xmm0, DWORD PTR [rsp+80+xmm8*4], xmm15
vgatherdps xmm9, DWORD PTR [rsp+64+xmm2*4], xmm13
vcmpleps xmm0, xmm0, xmm10
vcmpleps xmm9, xmm9, xmm11
vpand xmm0, xmm0, xmm9
vpand xmm0, xmm0, xmm1
vmovaps xmm1, xmm0
vmovaps xmm10, xmm0
vmovaps xmm9, xmm0
vmovaps xmm14, xmm0
vgatherdps xmm10, DWORD PTR [rsp+208+xmm8*4], xmm1
vmovaps xmm1, xmm0
vgatherdps xmm9, DWORD PTR [r14+xmm8*4], xmm1
vmovaps xmm1, xmm0
vmovaps xmm8, xmm0
vgatherdps xmm8, DWORD PTR [rsp+192+xmm2*4], xmm1
vmovaps xmm1, xmm0
vgatherdps xmm1, DWORD PTR [rsp+96+xmm2*4], xmm14
vcmpleps xmm2, xmm9, xmm8
vcmpleps xmm1, xmm1, xmm10
vpand xmm1, xmm1, xmm2
vpand xmm1, xmm1, xmm4
vpand xmm0, xmm0, xmm1
vmovdqa XMMWORD PTR [rax-32], xmm0
cmp rdx, rax
jne .L3
vmovdqa xmm5, XMMWORD PTR [rsp+224]
vmovdqa xmm7, XMMWORD PTR [rsp+240]
vmovdqa xmm4, XMMWORD PTR [rsp+256]
lea rbx, [rsp+288]
lea r12, [rsp+352]
vmovdqa XMMWORD PTR [rsp+288], xmm5
vmovdqa xmm5, XMMWORD PTR [rsp+272]
vmovdqa XMMWORD PTR [rsp+304], xmm7
vmovdqa XMMWORD PTR [rsp+320], xmm4
vmovdqa XMMWORD PTR [rsp+336], xmm5
.L4:
// character limit 30k
它大约有 110 行向量指令。尽管指令数比第一版少,它的运行速度却只有第一版的一半(至少在使用 bdver1 编译选项时如此)。这是因为计算索引时用到的“按位与”和除法运算吗?
另外,使用 restrict 关键字修饰的参数偶尔会指向同一块内存。这会不会影响性能?
如果有帮助的话,下面是在某个配备 AVX512 CPU 的在线服务上运行的性能测试源代码(每个叶节点最多 32 个 AABB):
当每个叶节点有 128 个 AABB 时(在 Godbolt 服务器上测试),性能差距会小一些。
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
data:image/s3,"s3://crabby-images/d5906/d59060df4059a6cc364216c4d63ceec29ef7fe66" alt="扫码二维码加入Web技术交流群"
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论