Slow AVX performance for bitwise XOR and popcount
I am new to writing AVX intrinsics-based code, so I need some help understanding whether my observations are expected. I have two methods implementing distance computations; both take two float arrays and their dimension, and return a float distance. The first method computes a (squared) Euclidean distance:
static float
compute_l2Square(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    float *pVect1 = (float *) pVect1v;
    float *pVect2 = (float *) pVect2v;
    size_t qty = *((size_t *) qty_ptr);
    float __attribute__((aligned(32))) TmpRes[8];
    size_t qty16 = qty >> 4;

    const float *pEnd1 = pVect1 + (qty16 << 4);

    __m256 diff, v1, v2;
    __m256 sum = _mm256_set1_ps(0);

    while (pVect1 < pEnd1) {
        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        diff = _mm256_sub_ps(v1, v2);
        sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));

        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        diff = _mm256_sub_ps(v1, v2);
        sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
    }

    _mm256_store_ps(TmpRes, sum);
    return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
}
The second method computes a bitwise XOR and then counts the number of set bits, i.e. the Hamming distance:
static float compute_hamming(const void* __restrict pVect1v,
                             const void* __restrict pVect2v,
                             const void* __restrict qty_ptr) {
    float *pVect1 = (float *) pVect1v;
    float *pVect2 = (float *) pVect2v;
    size_t qty = *((size_t *) qty_ptr);
    uint64_t __attribute__((aligned(32))) TmpRes[4];
    size_t qty16 = qty >> 4;

    const float *pEnd1 = pVect1 + (qty16 << 4);

    int res = 0;
    __m256 diff, v1, v2;

    while (pVect1 < pEnd1) {
        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        diff = _mm256_xor_ps(v1, v2);
        _mm256_store_si256((__m256i*)TmpRes, _mm256_castps_si256(diff));
        res += __builtin_popcountll(TmpRes[0]) + __builtin_popcountll(TmpRes[1])
             + __builtin_popcountll(TmpRes[2]) + __builtin_popcountll(TmpRes[3]);

        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        diff = _mm256_xor_ps(v1, v2);
        _mm256_store_si256((__m256i*)TmpRes, _mm256_castps_si256(diff));
        res += __builtin_popcountll(TmpRes[0]) + __builtin_popcountll(TmpRes[1])
             + __builtin_popcountll(TmpRes[2]) + __builtin_popcountll(TmpRes[3]);
    }

    return res;
}
For the same number of bits, the L2 squared distance computation is much faster than Hamming, almost 2x-4x (i.e. computing the L2 distance for 512 bits, which is 16 floats, is faster than computing Hamming on the same 16 floats). I am not really sure if this is expected.

To me it seems that the popcount and storing the results to a temp array are causing the slowness, because when I modify the L2 distance computation to do an XOR operation instead of a subtract, i.e. replace _mm256_sub_ps with _mm256_xor_ps, the L2 computation becomes faster.

I am benchmarking on macOS, on a machine with AVX instruction support. Another observation is that a non-AVX implementation of Hamming distance using just a loop, sum += popcount(vec_a[i] ^ vec_b[i]), gives numbers similar to the AVX implementation. I also verified, as a sanity check, that the AVX instructions and methods are actually being invoked.
The non-vectorized implementation:
static float compute_hamming(const void* __restrict pVect1,
                             const void* __restrict pVect2,
                             const void* __restrict qty_ptr) {
    size_t qty = *((size_t *) qty_ptr);
    int res = 0;

    const float *pVect1LL = (const float *) pVect1;
    const float *pVect2LL = (const float *) pVect2;
    for (unsigned i = 0; i < qty; i = i + 2) {
        if (i + 1 == qty) {
            unsigned int v1;
            unsigned int v2;
            memcpy(&v1, &pVect1LL[i], sizeof(float));
            memcpy(&v2, &pVect2LL[i], sizeof(float));
            res += __builtin_popcount(v1 ^ v2);
            break;
        }
        uint64_t v1;
        uint64_t v2;
        memcpy(&v1, &pVect1LL[i], sizeof(float) * 2);
        memcpy(&v2, &pVect2LL[i], sizeof(float) * 2);
        res += __builtin_popcountll(v1 ^ v2);
    }
    return res;
}
I need some help and recommendations on improving the performance, since the distance computation method is the bottleneck.
2 Answers
You could speed up your l2Square version more by using _mm256_fmadd_ps, if you're targeting Haswell and newer. (And Piledriver, except you're on a Mac and you probably don't care about AMD Hackintosh machines.)

Equally or more importantly, use two separate __m256 sum0, sum1 accumulators to hide FP latency, adding them together at the end before reducing. (With an efficient hsum, not just a store and then a scalar add of each element in turn; there are well-known shuffle patterns for reducing a __m256 specifically.)
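For example, something along these lines (untested sketch; assumes FMA is available and that qty is a multiple of 16, the function name is just illustrative):

#include <immintrin.h>
#include <stddef.h>

// Sketch: squared Euclidean distance with FMA and two accumulators to hide FP latency.
// Assumes qty is a multiple of 16; remainder handling is omitted.
static float l2_square_fma(const float *a, const float *b, size_t qty) {
    __m256 sum0 = _mm256_setzero_ps();
    __m256 sum1 = _mm256_setzero_ps();
    for (size_t i = 0; i < qty; i += 16) {
        __m256 d0 = _mm256_sub_ps(_mm256_loadu_ps(a + i),     _mm256_loadu_ps(b + i));
        __m256 d1 = _mm256_sub_ps(_mm256_loadu_ps(a + i + 8), _mm256_loadu_ps(b + i + 8));
        sum0 = _mm256_fmadd_ps(d0, d0, sum0);   // sum0 += d0*d0
        sum1 = _mm256_fmadd_ps(d1, d1, sum1);   // sum1 += d1*d1
    }
    __m256 sum = _mm256_add_ps(sum0, sum1);
    // Efficient horizontal sum: 256 -> 128 -> 64 -> 32 bits, no store/scalar loop.
    __m128 lo = _mm256_castps256_ps128(sum);
    __m128 hi = _mm256_extractf128_ps(sum, 1);
    __m128 s  = _mm_add_ps(lo, hi);
    s = _mm_add_ps(s, _mm_movehl_ps(s, s));
    s = _mm_add_ss(s, _mm_movehdup_ps(s));
    return _mm_cvtss_f32(s);
}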
Without hardware SIMD popcount (AVX512 VPOPCOUNTDQ), yes, of course it's going to be slower, especially if the compiler doesn't vectorize those per-element __builtin_popcountll(vec[0]) + ... into a SIMD popcount using a nibble LUT or something (vpshufb).

The way you're doing it is actually making things worse for clang, by getting it to do SIMD XOR but then actually extract to scalar instead of just using scalar XOR and popcnt in the first place; note the vpextrq instructions in the asm. Clang can auto-vectorize __builtin_popcountll in a loop (in a not-terrible but not great way), but not like this. (Actually, SIMD XOR and then scalar extract for popcnt is not nearly as bad as I thought, but only if you use 128-bit vectors; see the "sse-cpu" results in Wojciech Mula's git repo linked below, where even SSE for pure loads doesn't slow it down much.)

For example, clang auto-vectorizes this with YMM vectors inside the loop (Godbolt showing this and your code). Unfortunately it does a bad job with char* arrays, and with unsigned instead of unsigned long it only uses XMM vectors.

Using memcpy for unaligned aliasing-safe loads from char* also seemed to defeat vectorization, or some variation on this used scalar load and xor; you may need typedef uint64_t aliasing_unaligned_u64 __attribute__((aligned(4), may_alias));. (I used aligned(4) on the assumption you're pointing it at aligned floats.)
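For instance (untested sketch; the function and pointer names are just illustrative), the typedef lets you write a plain scalar loop that the compiler is then allowed to auto-vectorize over float data:

#include <stdint.h>
#include <stddef.h>

// may_alias + reduced alignment so it's legal to point this at float data.
typedef uint64_t aliasing_unaligned_u64 __attribute__((aligned(4), may_alias));

// Sketch: scalar XOR + __builtin_popcountll over 64-bit chunks, written so that
// clang has a chance to auto-vectorize it. Assumes qty (number of floats) is even.
static int hamming_autovec(const float *a, const float *b, size_t qty) {
    const aliasing_unaligned_u64 *pa = (const aliasing_unaligned_u64 *)a;
    const aliasing_unaligned_u64 *pb = (const aliasing_unaligned_u64 *)b;
    int res = 0;
    for (size_t i = 0; i < qty / 2; i++)
        res += __builtin_popcountll(pa[i] ^ pb[i]);
    return res;
}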
However, your best bet is to manually vectorize the SIMD popcount; see https://github.com/WojciechMula/sse-popcount/. That also avoids any futzing with types to make strict-aliasing-safe code that will auto-vectorize nicely over arrays of float data.
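The inner loop could look something like this (untested sketch, assumes AVX2; for simplicity it does the vpsadbw accumulation every iteration, which is less optimal than the deferred accumulation the repo uses):

#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

// Sketch: XOR two byte streams and popcount with the nibble-LUT (vpshufb) +
// vpsadbw technique. Assumes n_bytes is a multiple of 32; remainder handling omitted.
static uint64_t xor_popcount_avx2(const uint8_t *a, const uint8_t *b, size_t n_bytes) {
    const __m256i lookup = _mm256_setr_epi8(
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    const __m256i low_mask = _mm256_set1_epi8(0x0f);
    __m256i acc = _mm256_setzero_si256();            // 4 x uint64 partial sums
    for (size_t i = 0; i < n_bytes; i += 32) {
        __m256i v = _mm256_xor_si256(
            _mm256_loadu_si256((const __m256i *)(a + i)),
            _mm256_loadu_si256((const __m256i *)(b + i)));
        __m256i lo  = _mm256_and_si256(v, low_mask);
        __m256i hi  = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
        __m256i cnt = _mm256_add_epi8(_mm256_shuffle_epi8(lookup, lo),
                                      _mm256_shuffle_epi8(lookup, hi)); // per-byte counts
        acc = _mm256_add_epi64(acc, _mm256_sad_epu8(cnt, _mm256_setzero_si256()));
    }
    __m128i s = _mm_add_epi64(_mm256_castsi256_si128(acc),
                              _mm256_extracti128_si256(acc, 1));
    return (uint64_t)_mm_cvtsi128_si64(s) + (uint64_t)_mm_extract_epi64(s, 1);
}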
For large counts, it's possible to go even faster than a good implementation of using just vpshufb ymm / a vertical-sum inner loop / vpsadbw to hsum to qwords before it can overflow. For example, the Harley-Seal SIMD popcount code in that repo is about 20% faster on Skylake than the best "avx-lookup" implementation from the same repo, for arrays of 4096 bytes. (And twice as fast as "avx2-lookup-original"; I forget what the difference was.) See the results for clang on Skylake.

Changing popcnt_AVX2_lookup to take two pointers and _mm256_xor_si256 is trivial: just replace the __m256i vec = _mm256_loadu with that couple of statements. Or do the same with Harley-Seal if your arrays are large enough to warrant it; it shouldn't cause any extra register pressure since it can compile to a load / memory-source vpxor. Also tweak its unroll factor to be good with your typical problem sizes.
Since small size is apparently common for your use-case (which I didn't realize originally):
Another thing to consider with your real use case is how frequently you'll have odd sizes. If AVX2-lookup is only good with a multiple of the unroll factor, and needs unrolling to keep up, you might end up with a lot of your inputs spending a lot of time in its fallback path. So it would either be important to make that efficient, or be a good reason to drop it and just use SSE2 XOR + scalar popcnt which can easily do 16-byte granularity without a problem.
Yeah, your observations are expected. Your code for Euclidean is more or less OK, but your code for Hamming is very inefficient.
Since you mentioned AVX1 but not AVX2, I assume you don’t have AVX2. In that case, I would do it like that, untested.
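A minimal sketch of one such approach (not this answer's original code, which isn't reproduced here): 128-bit SSE XOR plus scalar popcnt, assuming POPCNT support and a float count that is a multiple of 4.

#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

// Sketch only: XOR 16 bytes at a time with SSE2, then popcount the two 64-bit
// halves with the scalar popcnt instruction (present on any AVX-capable CPU).
// Assumes qty (number of floats) is a multiple of 4; remainder handling omitted.
static float compute_hamming_sse(const float *a, const float *b, size_t qty) {
    uint64_t res = 0;
    for (size_t i = 0; i < qty; i += 4) {
        __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(a + i)),
                                  _mm_loadu_si128((const __m128i *)(b + i)));
        res += (uint64_t)_mm_popcnt_u64((uint64_t)_mm_cvtsi128_si64(x));
        res += (uint64_t)_mm_popcnt_u64((uint64_t)_mm_extract_epi64(x, 1));
    }
    return (float)res;
}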