Slow AVX performance for bitwise XOR and popcount
I am new to writing AVX intrinsics-based code, so I need some help understanding whether my observations are expected. I have two methods implementing distance computations; both take two float arrays and their dimension, and return a float distance. The first method computes a (squared) Euclidean distance:
static float
compute_l2Square(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    float *pVect1 = (float *) pVect1v;
    float *pVect2 = (float *) pVect2v;
    size_t qty = *((size_t *) qty_ptr);
    float __attribute__((aligned(32))) TmpRes[8];
    size_t qty16 = qty >> 4;

    const float *pEnd1 = pVect1 + (qty16 << 4);

    __m256 diff, v1, v2;
    __m256 sum = _mm256_set1_ps(0);

    while (pVect1 < pEnd1) {
        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        diff = _mm256_sub_ps(v1, v2);
        sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));

        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        diff = _mm256_sub_ps(v1, v2);
        sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
    }

    _mm256_store_ps(TmpRes, sum);
    return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
}
The second method computes a bitwise XOR and then counts the number of set bits, i.e. the Hamming distance:
static float compute_hamming(const void* __restrict pVect1v,
                             const void* __restrict pVect2v,
                             const void* __restrict qty_ptr) {
    float *pVect1 = (float *) pVect1v;
    float *pVect2 = (float *) pVect2v;
    size_t qty = *((size_t *) qty_ptr);
    uint64_t __attribute__((aligned(32))) TmpRes[4];
    size_t qty16 = qty >> 4;

    const float *pEnd1 = pVect1 + (qty16 << 4);

    int res = 0;
    __m256 diff, v1, v2;

    while (pVect1 < pEnd1) {
        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        diff = _mm256_xor_ps(v1, v2);
        _mm256_store_si256((__m256i*)TmpRes, _mm256_castps_si256(diff));
        res += __builtin_popcountll(TmpRes[0]) + __builtin_popcountll(TmpRes[1])
             + __builtin_popcountll(TmpRes[2]) + __builtin_popcountll(TmpRes[3]);

        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        diff = _mm256_xor_ps(v1, v2);
        _mm256_store_si256((__m256i*)TmpRes, _mm256_castps_si256(diff));
        res += __builtin_popcountll(TmpRes[0]) + __builtin_popcountll(TmpRes[1])
             + __builtin_popcountll(TmpRes[2]) + __builtin_popcountll(TmpRes[3]);
    }

    return res;
}
For the same number of bits, the L2 squared distance computation is much faster than Hamming, almost 2x-4x (i.e. computing the L2 distance for 512 bits, which is 16 floats, is faster than computing Hamming on the same 16 floats). I am not really sure if this is expected.

To me it seems that the popcount and storing the results to a temp array are causing the slowness, because when I modify the L2 distance computation to do an XOR operation instead of a subtract, i.e. replace _mm256_sub_ps with _mm256_xor_ps, the L2 computation becomes faster.

I am benchmarking on macOS, on a machine with AVX instruction support. Another observation is that a non-AVX implementation of Hamming distance using just a loop, sum += popcount(vec_a[i] ^ vec_b[i]), gives numbers similar to the AVX implementation. I also verified, as a sanity check, that the AVX instructions and methods are actually being invoked.
The non-vectorized implementation:
static float compute_hamming(const void* __restrict pVect1,
                             const void* __restrict pVect2,
                             const void* __restrict qty_ptr) {
    size_t qty = *((size_t *) qty_ptr);
    int res = 0;

    const float *pVect1LL = (const float *) pVect1;
    const float *pVect2LL = (const float *) pVect2;
    for (unsigned i = 0; i < qty; i = i + 2) {
        if (i + 1 == qty) {
            unsigned int v1;
            unsigned int v2;
            memcpy(&v1, &pVect1LL[i], sizeof(float));
            memcpy(&v2, &pVect2LL[i], sizeof(float));
            res += __builtin_popcount(v1 ^ v2);
            break;
        }
        uint64_t v1;
        uint64_t v2;
        memcpy(&v1, &pVect1LL[i], sizeof(float) * 2);
        memcpy(&v2, &pVect2LL[i], sizeof(float) * 2);
        res += __builtin_popcountll(v1 ^ v2);
    }
    return res;
}
I need some help and recommendations on improving the performance, since the distance computation method is the bottleneck.
2 Answers
You could speed up your l2Square version more by using _mm256_fmadd_ps, if you're targeting Haswell and newer. (And Piledriver, except you're on a Mac and you probably don't care about AMD Hackintosh machines.)

Equally or more importantly, use two separate __m256 sum0, sum1 accumulators to hide FP latency, adding them together at the end before reducing. (With an efficient hsum, not just a store and then a scalar add of each element in turn; there are well-known shuffle patterns for reducing a __m256 specifically.)
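For example, something along these lines (untested sketch; assumes FMA is available and that qty is a multiple of 16, the function name is just illustrative):

#include <immintrin.h>
#include <stddef.h>

// Sketch: squared Euclidean distance with FMA and two accumulators to hide FP latency.
// Assumes qty is a multiple of 16; remainder handling is omitted.
static float l2_square_fma(const float *a, const float *b, size_t qty) {
    __m256 sum0 = _mm256_setzero_ps();
    __m256 sum1 = _mm256_setzero_ps();
    for (size_t i = 0; i < qty; i += 16) {
        __m256 d0 = _mm256_sub_ps(_mm256_loadu_ps(a + i),     _mm256_loadu_ps(b + i));
        __m256 d1 = _mm256_sub_ps(_mm256_loadu_ps(a + i + 8), _mm256_loadu_ps(b + i + 8));
        sum0 = _mm256_fmadd_ps(d0, d0, sum0);   // sum0 += d0*d0
        sum1 = _mm256_fmadd_ps(d1, d1, sum1);   // sum1 += d1*d1
    }
    __m256 sum = _mm256_add_ps(sum0, sum1);
    // Efficient horizontal sum: 256 -> 128 -> 64 -> 32 bits, no store/scalar loop.
    __m128 lo = _mm256_castps256_ps128(sum);
    __m128 hi = _mm256_extractf128_ps(sum, 1);
    __m128 s  = _mm_add_ps(lo, hi);
    s = _mm_add_ps(s, _mm_movehl_ps(s, s));
    s = _mm_add_ss(s, _mm_movehdup_ps(s));
    return _mm_cvtss_f32(s);
}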
Without hardware SIMD popcount (AVX512 VPOPCOUNTDQ), yes, of course it's going to be slower, especially if the compiler doesn't vectorize those per-element __builtin_popcountll(vec[0]) + ... into a SIMD popcount using a nibble LUT or something (vpshufb).

The way you're doing it is actually making things worse for clang, by getting it to do SIMD XOR but then actually extract to scalar instead of just using scalar XOR and popcnt in the first place; note the vpextrq instructions in the asm. Clang can auto-vectorize __builtin_popcountll in a loop (in a not-terrible but not great way), but not like this. (Actually, SIMD XOR and then scalar extract for popcnt is not nearly as bad as I thought, but only if you use 128-bit vectors; see the "sse-cpu" results in Wojciech Mula's git repo linked below, where even SSE for pure loads doesn't slow it down much.)

For example, clang auto-vectorizes this with YMM vectors inside the loop (Godbolt showing this and your code). Unfortunately it does a bad job with char* arrays, and with unsigned instead of unsigned long it only uses XMM vectors.

Using memcpy for unaligned aliasing-safe loads from char* also seemed to defeat vectorization, or some variation on this used scalar load and xor; you may need typedef uint64_t aliasing_unaligned_u64 __attribute__((aligned(4), may_alias));. (I used aligned(4) on the assumption you're pointing it at aligned floats.)
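For instance (untested sketch; the function and pointer names are just illustrative), the typedef lets you write a plain scalar loop that the compiler is then allowed to auto-vectorize over float data:

#include <stdint.h>
#include <stddef.h>

// may_alias + reduced alignment so it's legal to point this at float data.
typedef uint64_t aliasing_unaligned_u64 __attribute__((aligned(4), may_alias));

// Sketch: scalar XOR + __builtin_popcountll over 64-bit chunks, written so that
// clang has a chance to auto-vectorize it. Assumes qty (number of floats) is even.
static int hamming_autovec(const float *a, const float *b, size_t qty) {
    const aliasing_unaligned_u64 *pa = (const aliasing_unaligned_u64 *)a;
    const aliasing_unaligned_u64 *pb = (const aliasing_unaligned_u64 *)b;
    int res = 0;
    for (size_t i = 0; i < qty / 2; i++)
        res += __builtin_popcountll(pa[i] ^ pb[i]);
    return res;
}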
However, your best bet is to manually vectorize the SIMD popcount; see https://github.com/WojciechMula/sse-popcount/. That also avoids any futzing with types to make strict-aliasing-safe code that will auto-vectorize nicely over arrays of float data.
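The inner loop could look something like this (untested sketch, assumes AVX2; for simplicity it does the vpsadbw accumulation every iteration, which is less optimal than the deferred accumulation the repo uses):

#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

// Sketch: XOR two byte streams and popcount with the nibble-LUT (vpshufb) +
// vpsadbw technique. Assumes n_bytes is a multiple of 32; remainder handling omitted.
static uint64_t xor_popcount_avx2(const uint8_t *a, const uint8_t *b, size_t n_bytes) {
    const __m256i lookup = _mm256_setr_epi8(
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    const __m256i low_mask = _mm256_set1_epi8(0x0f);
    __m256i acc = _mm256_setzero_si256();            // 4 x uint64 partial sums
    for (size_t i = 0; i < n_bytes; i += 32) {
        __m256i v = _mm256_xor_si256(
            _mm256_loadu_si256((const __m256i *)(a + i)),
            _mm256_loadu_si256((const __m256i *)(b + i)));
        __m256i lo  = _mm256_and_si256(v, low_mask);
        __m256i hi  = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
        __m256i cnt = _mm256_add_epi8(_mm256_shuffle_epi8(lookup, lo),
                                      _mm256_shuffle_epi8(lookup, hi)); // per-byte counts
        acc = _mm256_add_epi64(acc, _mm256_sad_epu8(cnt, _mm256_setzero_si256()));
    }
    __m128i s = _mm_add_epi64(_mm256_castsi256_si128(acc),
                              _mm256_extracti128_si256(acc, 1));
    return (uint64_t)_mm_cvtsi128_si64(s) + (uint64_t)_mm_extract_epi64(s, 1);
}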
For large counts, it's possible to go even faster than a good implementation of using just vpshufb ymm / a vertical-sum inner loop / vpsadbw to hsum to qwords before it can overflow. For example, the Harley-Seal SIMD popcount code in that repo is about 20% faster on Skylake than the best "avx-lookup" implementation from the same repo, for arrays of 4096 bytes. (And twice as fast as "avx2-lookup-original"; I forget what the difference was.) See the results for clang on Skylake.

Changing popcnt_AVX2_lookup to take two pointers and _mm256_xor_si256 is trivial: just replace the __m256i vec = _mm256_loadu with that couple of statements. Or do the same with Harley-Seal if your arrays are large enough to warrant it; it shouldn't cause any extra register pressure since it can compile to a load / memory-source vpxor. Also tweak its unroll factor to be good with your typical problem sizes.
Since small size is apparently common for your use-case (which I didn't realize originally):
Another thing to consider with your real use case is how frequently you'll have odd sizes. If AVX2-lookup is only good with a multiple of the unroll factor, and needs unrolling to keep up, you might end up with a lot of your inputs spending a lot of time in its fallback path. So it would either be important to make that efficient, or be a good reason to drop it and just use SSE2 XOR + scalar popcnt which can easily do 16-byte granularity without a problem.
Yeah, your observations are expected. Your code for Euclidean is more or less OK, but your code for Hamming is very inefficient.
Since you mentioned AVX1 but not AVX2, I assume you don’t have AVX2. In that case, I would do it like that, untested.
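A minimal sketch of one such approach (not this answer's original code, which isn't reproduced here): 128-bit SSE XOR plus scalar popcnt, assuming POPCNT support and a float count that is a multiple of 4.

#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

// Sketch only: XOR 16 bytes at a time with SSE2, then popcount the two 64-bit
// halves with the scalar popcnt instruction (present on any AVX-capable CPU).
// Assumes qty (number of floats) is a multiple of 4; remainder handling omitted.
static float compute_hamming_sse(const float *a, const float *b, size_t qty) {
    uint64_t res = 0;
    for (size_t i = 0; i < qty; i += 4) {
        __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(a + i)),
                                  _mm_loadu_si128((const __m128i *)(b + i)));
        res += (uint64_t)_mm_popcnt_u64((uint64_t)_mm_cvtsi128_si64(x));
        res += (uint64_t)_mm_popcnt_u64((uint64_t)_mm_extract_epi64(x, 1));
    }
    return (float)res;
}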