Is there a fast way to convert a string of 8 ASCII decimal digits into a binary number?

Posted 2025-01-15 22:22:27


Consider an 8-digit string such as "12345678". It can be converted to a 64-bit value in which every byte holds one digit, like this:

const char* const str = "12345678";
const char* const base = "00000000";
const uint64_t unpacked = *reinterpret_cast<const uint64_t*>(str)
    - *reinterpret_cast<const uint64_t*>(base);

Then unpacked will be 0x0807060504030201 on a little-endian system.

What is the fastest way to convert the number into 12345678, perhaps by multiplying it by some magic number or using SIMD up to AVX2?

UPDATE: 12345678 has to be a number stored in a 32-bit or 64-bit integer, not a string.
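
As an aside, dereferencing those reinterpret_cast'ed pointers formally violates strict aliasing. Below is a sketch of the same unpacking step written with std::memcpy, which optimizing compilers fold into a single 8-byte load (the helper name unpack_digits is illustrative, not from the question):

#include <cstdint>
#include <cstring>

inline uint64_t unpack_digits(const char* s)
{
    uint64_t v;
    std::memcpy(&v, s, sizeof v);      // compiles to one 8-byte load
    return v - 0x3030303030303030ULL;  // "00000000" as a little-endian 64-bit value
}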

4 Answers

独闯女儿国 2025-01-22 22:22:28


On an older x86-64 system without AVX2, the simple version below, which gathers the digits in tree fashion, is quite efficient, with performance on par with a simple SWAR-based implementation per my measurements. However, it requires a processor with plenty of instruction-level parallelism, as it comprises 50% more instructions than the SWAR-based code when compiled with full optimizations.

#include <stdint.h>  /* uint32_t */

/* convert a string of exactly eight 'char' into a 32-bit unsigned integer */
uint32_t string_to_number (const char * s)
{
    uint32_t t0 = s[0] * 10 + s[1];
    uint32_t t1 = s[2] * 10 + s[3];
    uint32_t t2 = s[4] * 10 + s[5];
    uint32_t t3 = s[6] * 10 + s[7];
    uint32_t s0 = t0 * 100 + t1;
    uint32_t s1 = t2 * 100 + t3;
    uint32_t num = s0 * 10000 + s1;
    /* bias of eight ASCII '0' characters, folded to a constant at compile time */
    uint32_t corr =
        '0' * 10000000 +
        '0' * 1000000 +
        '0' * 100000 +
        '0' * 10000 +
        '0' * 1000 +
        '0' * 100 +
        '0' * 10 +
        '0' * 1;
    return num - corr;
}
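
A quick sanity check for the routine above, as a minimal driver of my own (not part of the answer):

#include <cstdio>

int main()
{
    std::printf("%u\n", string_to_number("12345678")); // prints 12345678
    return 0;
}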
大海や 2025-01-22 22:22:28


If you change your input format to breadth-first element order like this:

Sample: 9 numbers, interleaved, digits stored most-significant first
digit[]:
1 1 1 1 1 1 1 1 1   <- first digit of each of the 9 numbers
2 2 2 2 2 2 2 2 2   <- second digit of each
3 3 3 3 3 3 3 3 3   <- third digit of each, and so on

for(int j=0; j<num_parse; j+=9)   // num_parse = total digit count
{
    for(int i=0; i<9; i++)        // one lane per number
    {
        value[i] = value[i]*10 + (digit[i+j]-'0');
    }
    // after the last digit group of a batch:
    // copy the value vector to the output, then clear it
}

If you convert more than just 9 values at a time, say 512 or 8192, padded to a multiple of 32, the compiler should vectorize it.

To prepare the input, you can use 8 separate channels, one per digit position of every parsed value; a self-contained sketch follows.
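
A minimal, runnable version of the idea above, assuming 8-digit numbers stored most significant digit first (the names kLanes, kDigits, and parse_interleaved are mine, not from the answer); the uniform inner loop is a good candidate for auto-vectorization at -O3:

#include <cstdint>
#include <cstdio>

constexpr int kLanes  = 16;  // numbers parsed in parallel
constexpr int kDigits = 8;   // digits per number, most significant first

// digit[d * kLanes + i] holds digit d of number i
static void parse_interleaved(const char* digit, uint32_t* value)
{
    for (int i = 0; i < kLanes; i++)
        value[i] = 0;
    for (int d = 0; d < kDigits; d++)
        for (int i = 0; i < kLanes; i++)   // all lanes do the same work
            value[i] = value[i] * 10 + uint32_t(digit[d * kLanes + i] - '0');
}

int main()
{
    char digit[kDigits * kLanes];
    for (int d = 0; d < kDigits; d++)      // fill every lane with "12345678"
        for (int i = 0; i < kLanes; i++)
            digit[d * kLanes + i] = char('1' + d);

    uint32_t value[kLanes];
    parse_interleaved(digit, value);
    std::printf("%u\n", value[0]);         // prints 12345678
    return 0;
}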

若水微香 2025-01-22 22:22:28


I've implemented a small program to test some ideas. The AVX2 implementation is ~1.5x faster than the naive one, with the table implementation in between:

AVX2: 12345678 in 3.42759
Naive: 12345678 in 5.12581
Table: 12345678 in 4.49478

Source code:

#include <cstdint>
#include <ctime>       // clock(), CLOCKS_PER_SEC
#include <immintrin.h>
#include <iostream>
using namespace std;

const __m256i mul = _mm256_setr_epi32(10000000, 1000000, 100000, 10000, 1000, 100, 10, 1);

const volatile char* str = "12345678";
volatile uint32_t h;

const int64_t nIter = 1000LL * 1000LL * 1000LL;

inline void parse_avx2() {
  const char* const base = "00000000";
  const uint64_t unpacked = *reinterpret_cast<const volatile uint64_t*>(str)
    - *reinterpret_cast<const uint64_t*>(base);

  const __m128i a = _mm_set1_epi64x(unpacked);
  const __m256i b = _mm256_cvtepu8_epi32(a);
  const __m256i d = _mm256_mullo_epi32(b, mul);
  const __m128i e = _mm_add_epi32(_mm256_extractf128_si256(d, 0), _mm256_extractf128_si256(d, 1));
  const uint64_t f0 = _mm_extract_epi64(e, 0);
  const uint64_t f1 = _mm_extract_epi64(e, 1);
  const uint64_t g = f0 + f1;
  h = (g>>32) + (g&0xffffffff);
}

inline void parse_naive() {
  const char* const base = "00000000";
  const uint64_t unpacked = *reinterpret_cast<const volatile uint64_t*>(str)
    - *reinterpret_cast<const uint64_t*>(base);

  const uint8_t* a = reinterpret_cast<const uint8_t*>(&unpacked);
  h = a[7] + a[6]*10 + a[5]*100 + a[4]*1000 + a[3]*10000 + a[2]*100000 + a[1]*1000000 + a[0]*10000000;
}

uint32_t summands[8][10];
inline void parse_table() {
  const char* const base = "00000000";
  const uint64_t unpacked = *reinterpret_cast<const volatile uint64_t*>(str)
    - *reinterpret_cast<const uint64_t*>(base);

  const uint8_t* a = reinterpret_cast<const uint8_t*>(&unpacked);
  h = summands[7][a[0]] + summands[6][a[1]] + summands[5][a[2]] + summands[4][a[3]]
    + summands[3][a[4]] + summands[2][a[5]] + summands[1][a[6]] + summands[0][a[7]];
}

int main() {
  clock_t start = clock();
  for(int64_t i=0; i<nIter; i++) {
    parse_avx2();
  }
  clock_t end = clock();
  cout << "AVX2: " << h << " in " << double(end-start)/CLOCKS_PER_SEC << endl;

  start = clock();
  for(int64_t i=0; i<nIter; i++) {
    parse_naive();
  }
  end = clock();
  cout << "Naive: " << h << " in " << double(end-start)/CLOCKS_PER_SEC << endl;

  uint32_t mul=1;
  for(int i=0; i<8; i++, mul*=10) {
    for(int j=0; j<10; j++) {   // all ten digit values, 0 through 9
      summands[i][j] = j*mul;
    }
  }

  start = clock();
  for(int64_t i=0; i<nIter; i++) {
    parse_table();
  }
  end = clock();
  cout << "Table: " << h << " in " << double(end-start)/CLOCKS_PER_SEC << endl;
  return 0;
}
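
For reference, a plausible way to build and run the benchmark above (the compiler flags and file name are my assumption, not stated in the answer):

g++ -O3 -mavx2 parse_bench.cpp -o parse_bench && ./parse_bench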
别把无礼当个性 2025-01-22 22:22:27


Multiplication in binary is just a series of shifts and adds, so the SWAR approach shouldn't be too hard to understand: each masked multiply-and-shift step combines adjacent pairs, with 2561 = 10*2^8 + 1 merging neighboring bytes into two-digit values, 6553601 = 100*2^16 + 1 merging those into four-digit values, and 42949672960001 = 10000*2^32 + 1 producing the final eight-digit result. For a detailed walk-thru, see the link in the first snippet:

#include <stdint.h>     // uint32_t, uint64_t
#include <string.h>     // memcpy
#include <tmmintrin.h>  // SSSE3 intrinsics used by the third snippet

// http://govnokod.ru/13461
static inline
uint32_t parse_8digits_swar_classic (char* str) {
    uint64_t v;

    memcpy(&v, str, 8);
    v = (v & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;            // 2561 = 10*2^8 + 1
    v = (v & 0x00FF00FF00FF00FF) * 6553601 >> 16;        // 6553601 = 100*2^16 + 1
    v = (v & 0x0000FFFF0000FFFF) * 42949672960001 >> 32; // 42949672960001 = 10000*2^32 + 1
    return v;
}

// attempt to improve the latency
static inline
uint32_t parse_8digits_swar_aqrit (char* str) {
    const uint64_t mask = 0x000000FF000000FF;
    uint64_t v, t;

    memcpy(&v, str, 8);
    v = (v * 10) + (v >> 8);
    t = (v & mask) * 0x000F424000000064;
    v = ((v >> 16) & mask) * 0x0000271000000001;
    v = (v + t + 0xFF0915C600000000ULL) >> 32;
    return v;
}

// SSSE3 needs fewer `shift & mask` operations...
static inline
uint32_t parse_8digits_simd_ssse3 (char* str) {
    const __m128i mul1 = _mm_set_epi32(0, 0, 0x010A0A64, 0x14C814C8);
    const __m128i mul2 = _mm_set_epi32(0, 0, 0x0001000A, 0x00FA61A8);
    const __m128i mask = _mm_set1_epi8(0x0F);
    __m128i v;

    v = _mm_loadl_epi64((__m128i*)(void*)str);
    v = _mm_and_si128(v, mask);
    v = _mm_madd_epi16(_mm_maddubs_epi16(mul1, v), mul2);
    v = _mm_add_epi32(_mm_add_epi32(v, v), _mm_shuffle_epi32(v, 1));
    return (uint32_t)_mm_cvtsi128_si32(v);
}
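
If the three snippets are pasted into one file, a minimal driver of my own could look like this:

#include <stdio.h>

int main(void)
{
    char buf[] = "12345678";
    printf("%u\n", parse_8digits_swar_classic(buf)); /* 12345678 */
    printf("%u\n", parse_8digits_swar_aqrit(buf));   /* 12345678 */
    printf("%u\n", parse_8digits_simd_ssse3(buf));   /* 12345678 */
    return 0;
}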