当前位置：文江博客话题详情

c bit-manipulation 32-bit

查找位数组中设置的最高有效位（最左边）

发布于 2024-08-28 06:46:02 字数 168 浏览 11 评论 0原文

我有一个位数组实现，其中第 0 个索引是数组中第一个字节的 MSB，第 8 个索引是第二个字节的 MSB，等等...

在这个位数组中，找到第一个被设置的位的快速方法是什么？我查找过的所有相关解决方案找的都是第一个最低有效位，但我需要的是第一个最高有效位。因此，给定 0x00A1，我想要得到 8（因为它是左起第 9 位）。

收藏 0

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

评论（17）

想挽留 2024-09-04 06:46:02

GCC 有 __builtin_clz 转换为 x86/x64 上的 BSR、ARM 上的 CLZ 等，并在硬件未实现时模拟该指令。
Visual C++ 2005 及更高版本具有 _BitScanReverse。

回复收藏 0 原文

回忆那么伤 2024-09-04 06:46:02

太长不看（tl;dr）：对于 32 位，请使用 de Bruijn 乘法。

这是 “最快” 可移植算法。它比该线程中的所有其他可移植 32 位 MSB 算法更快、更正确。

当输入为零时，de Bruijn 算法也会返回正确的结果。 __builtin_clz 和 _BitScanReverse 指令在输入为零时会返回不正确的结果。

在 Windows x86-64 上，de Bruijn 乘法的运行速度与等效（有缺陷的）Windows 函数相当，性能差异仅为 3% 左右。

这是代码。

// Returns the zero-based index of the highest set bit of v (0 when v == 0)
// using a de Bruijn multiply-and-lookup. Assumes u32 is a 32-bit unsigned type.
u32 msbDeBruijn32( u32 v )
{
    // Bit index for each possible value of the top five bits of the product.
    static const int kBitIndexByHash[32] =
    {
        0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
        8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
    };

    // Copy the highest set bit into every lower position, so v becomes
    // one less than a power of two.
    for ( unsigned int shift = 1; shift <= 16; shift *= 2 )
        v |= v >> shift;

    const u32 hash = ( u32 )( v * 0x07C4ACDDU ) >> 27;
    return kBitIndexByHash[hash];
}

该线程中的所有其他答案要么运行得比作者建议的要差得多，要么没有正确计算结果，或者两者兼而有之。让我们对它们进行基准测试，并验证它们是否按照它们声称的那样进行。

这是一个简单的 C++11 工具来测试所有这些实现。它在 Visual Studio 上编译干净，但应该适用于所有现代编译器。它允许您在性能模式 (bVerifyResults = false) 和检查模式 (bVerifyResults = true) 下运行基准测试。

以下是验证模式下的结果：

Verification failed for msbNative64: input was 0; output was 818af060; expected 0
Verification failed for msbFfs: input was 22df; output was 0; expected d
Verification failed for msbPerformanceJunkie32: input was 0; output was ffffffff; expected 0
Verification failed for msbNative32: input was 0; output was 9ab07060; expected 0

当输入为零时，“性能迷”和 Microsoft 本机实现会执行不同的操作。 msbPerformanceJunkie32 产生 -1，微软的 _BitScanReverse 产生一个随机数，与底层硬件指令一致。此外，msbPerformanceJunkie32 实现产生的结果与所有其他答案相差一。

以下是在我的 i7-4600 笔记本电脑上运行、在发布模式下编译的性能模式下的结果：

msbLoop64 took 2.56751 seconds               
msbNative64 took 0.222197 seconds            

msbLoop32 took 1.43456 seconds               
msbFfs took 0.525097 seconds                 
msbPerformanceJunkie32 took 1.07939 seconds  
msbDeBruijn32 took 0.224947 seconds          
msbNative32 took 0.218275 seconds

de Bruijn 版本彻底击败了其他实现，因为它是无分支的，因此对于会产生均匀分布输出的输入，它运行得很好。由于现代 CPU 上分支预测错误的惩罚，所有其他版本在处理任意输入时都较慢。msbFfs 函数会产生不正确的结果，因此可以忽略。

有些实现适用于 32 位输入，有些实现适用于 64 位输入。模板将帮助我们比较苹果与苹果，无论输入大小如何。

这是代码。如果您愿意，请自行下载并运行基准测试。

#include <iostream>
#include <chrono>
#include <random>
#include <cassert>
#include <string>
#include <limits>
#include <vector>   // std::vector is used by test() and main(); do not rely on transitive includes

#ifdef _MSC_VER
#define MICROSOFT_COMPILER 1
#include <intrin.h>
#endif // _MSC_VER

// Number of inputs generated (and calls made) per benchmark run.
const int iterations = 100000000;
// false: time each implementation; true: compare each against the reference results.
bool bVerifyResults = false;
// Random source shared by all input generation.
std::random_device rd;
std::default_random_engine re(rd());
typedef unsigned int u32;
typedef unsigned long long u64;

// Stopwatch over std::chrono::high_resolution_clock; starts when constructed.
class Timer
{
    using clock_ = std::chrono::high_resolution_clock;
    using second_ = std::chrono::duration<double, std::ratio<1> >;

public:
    Timer() : beg_(clock_::now()) {}

    // Restarts the measurement from the current instant.
    void reset() { beg_ = clock_::now(); }

    // Fractional seconds elapsed since construction or the last reset().
    double elapsed() const {
        const second_ span =
            std::chrono::duration_cast<second_>(clock_::now() - beg_);
        return span.count();
    }

private:
    std::chrono::time_point<clock_> beg_;
};

// Returns the one-based position of the highest set bit of x (0 when x == 0):
// x is narrowed to a 4-bit value in three steps, then finished by table lookup.
unsigned int msbPerformanceJunkie32(u32 x)
{
    // One-based position of the highest set bit for each 4-bit value.
    static const unsigned int kNibblePos[] =
    { 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4 };
    static const unsigned int kMask[3] = { 0xFFFF0000, 0x0000FF00, 0x000000F0 };
    static const unsigned int kShift[3] = { 16 / 1, 16 / 2, 16 / 4 };

    unsigned int pos = 0;
    for (int step = 0; step < 3; ++step)
    {
        if ((x & kMask[step]) != 0)
        {
            pos += kShift[step];
            x >>= kShift[step];
        }
    }
    return pos + kNibblePos[x];
}

// FFS(t): function-body macro returning the zero-based index of the LOWEST set
// bit of the 32-bit value t (31 when t == 0), found by a five-step binary search.
// It expands to a complete `{ ... return n; }` body, so it must be used as the
// entire body of a function returning an integer.
// `n` is a plain local: the `register` storage class is not valid in C++17.
// The argument is parenthesized so that expressions such as `a | b` work.
#define FFS(t)  \
{ \
int n = 0; \
if (!(0xffff & (t))) \
n += 16; \
if (!((0xff << n) & (t))) \
n += 8; \
if (!((0xf << n) & (t))) \
n += 4; \
if (!((0x3 << n) & (t))) \
n += 2; \
if (!((0x1 << n) & (t))) \
n += 1; \
return n; \
}

// Wraps the FFS macro, which expands to this function's entire body and
// performs the return itself.
// NOTE(review): FFS searches from the least-significant end, so despite the
// "msb" name this yields the lowest set bit's index — confirm against the macro.
unsigned int msbFfs32(u32 x)
{
    FFS(x);
}

// Returns the zero-based index of the highest set bit of x (0 when x == 0)
// by counting how many right shifts it takes to clear x.
unsigned int msbLoop32(u32 x)
{
    unsigned int index = 0;
    for (x >>= 1; x != 0; x >>= 1)
        ++index;
    return index;
}

// 64-bit variant: zero-based index of the highest set bit of x (0 when x == 0),
// found by counting the right shifts needed to clear x.
unsigned int msbLoop64(u64 x)
{
    unsigned int index = 0;
    for (x >>= 1; x != 0; x >>= 1)
        ++index;
    return index;
}

// Returns the zero-based index of the highest set bit of v (0 when v == 0)
// using a de Bruijn multiply-and-lookup. Relies on u32 being 32 bits wide.
u32 msbDeBruijn32(u32 v)
{
    // Bit index for each possible value of the top five bits of the product.
    static const int kBitIndexByHash[32] =
    {
        0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
        8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
    };

    // Copy the highest set bit into every lower position, so v becomes
    // one less than a power of two.
    for (unsigned int shift = 1; shift <= 16; shift *= 2)
        v |= v >> shift;

    const u32 hash = (u32)(v * 0x07C4ACDDU) >> 27;
    return kBitIndexByHash[hash];
}

#ifdef MICROSOFT_COMPILER
// Index of the highest set bit of val via the MSVC _BitScanReverse intrinsic.
// The intrinsic's return value is ignored and `result` starts uninitialized, so
// for val == 0 the returned value is whatever `result` happened to contain
// (NOTE(review): presumably the intrinsic leaves the index unwritten for a zero
// mask — confirm in the MSVC documentation).
u32 msbNative32(u32 val)
{
    unsigned long result;
    _BitScanReverse(&result, val);
    return result;
}
// 64-bit counterpart using _BitScanReverse64; same caveat for val == 0.
u32 msbNative64(u64 val)
{
    unsigned long result;
    _BitScanReverse64(&result, val);
    return result;
}
#endif // MICROSOFT_COMPILER

// Runs one MSB implementation over the first `iterations` elements of `inputs`.
//  - bIsReference == true: stores msbFunc's outputs into `results` (the expected
//    values for later calls) and returns without printing anything.
//  - otherwise, when the global bVerifyResults is set: reports the first input
//    whose output differs from `results`.
//  - otherwise: times the calls and prints the elapsed seconds.
// `inputs` and `results` must each hold at least `iterations` elements.
template <typename InputType>
void test(unsigned int msbFunc(InputType),
    const std::string &name,
    const std::vector< InputType > &inputs,
    std::vector< unsigned int > &results,
    bool bIsReference = false
)
{
    if (bIsReference)
    {
        for (int i = 0; i < iterations; i++)
            results[i] = msbFunc(inputs[i]);
        // A reference run is never reported as timed or verified, so running
        // the loop a second time would only burn time.
        return;
    }

    InputType result = 0;
    if (bVerifyResults)
    {
        for (int i = 0; i < iterations; i++)
        {
            result = msbFunc(inputs[i]);
            if (result != results[i])
            {
                // Only the first mismatch is reported; std::dec restores the
                // stream so later integer output is not left in hexadecimal.
                std::cout << "Verification failed for " << name << ": "
                    << "input was " << std::hex << inputs[i]
                    << "; output was " << result
                    << "; expected " << results[i]
                    << std::dec << std::endl;
                break;
            }
        }
    }
    else
    {
        Timer t;
        for (int i = 0; i < iterations; i++)
        {
            result = msbFunc(inputs[i]);
        }
        const double elapsed = t.elapsed();
        std::cout << name << " took " << elapsed << " seconds" << std::endl;
        // Consuming `result` keeps the timed loop from being optimized away.
        if (result == -1.0f)
            std::cout << "this comparison only exists to keep the compiler from " <<
            "optimizing out the benchmark; this branch will never be called";
    }
}

void main()
{
    std::uniform_int_distribution <u64> dist64(0,
        std::numeric_limits< u64 >::max());
    std::uniform_int_distribution <u32> shift64(0, 63);
    std::vector< u64 > inputs64;
    for (int i = 0; i < iterations; i++)
    {
        inputs64.push_back(dist64(re) >> shift64(re));
    }
    std::vector< u32 > results64;
    results64.resize(iterations);

    test< u64 >(msbLoop64, "msbLoop64", inputs64, results64, true);
    test< u64 >(msbLoop64, "msbLoop64", inputs64, results64, false);
#ifdef MICROSOFT_COMPILER
    test< u64 >(msbNative64, "msbNative64", inputs64, results64, false);
#endif // MICROSOFT_COMPILER
    std::cout << std::endl;

    std::uniform_int_distribution <u32> dist32(0,
        std::numeric_limits< u32 >::max());
    std::uniform_int_distribution <u32> shift32(0, 31);
    std::vector< u32 > inputs32;
    for (int i = 0; i < iterations; i++)
        inputs32.push_back(dist32(re) >> shift32(re));
    std::vector< u32 > results32;
    results32.resize(iterations);


    test< u32 >(msbLoop32, "msbLoop32", inputs32, results32, true);

    test< u32 >(msbLoop32, "msbLoop32", inputs32, results32, false);
    test< u32 >(msbFfs32, "msbFfs", inputs32, results32, false);
    test< u32 >(msbPerformanceJunkie32, "msbPerformanceJunkie32",
        inputs32, results32, false);
    test< u32 >(msbDeBruijn32, "msbDeBruijn32", inputs32, results32, false);
#ifdef MICROSOFT_COMPILER
    test< u32 >(msbNative32, "msbNative32", inputs32, results32, false);
#endif // MICROSOFT_COMPILER
}

tl;dr: For 32 bits, use de Bruijn multiplication.

It's the "fastest" portable algorithm. It is substantially faster and more correct than all the other portable 32-bit MSB algorithms in this thread.

The de Bruijn algorithm also returns a correct result when the input is zero. The __builtin_clz and _BitScanReverse instructions return incorrect results when the input is zero.

On Windows x86-64, de Bruijn multiplication runs at a speed comparable to the equivalent (flawed) Windows function, with a performance difference of only around 3%.

Here's the code.

// Returns the zero-based index of the highest set bit of v (0 when v == 0)
// using a de Bruijn multiply-and-lookup. Assumes u32 is a 32-bit unsigned type.
u32 msbDeBruijn32( u32 v )
{
    // Bit index for each possible value of the top five bits of the product.
    static const int kBitIndexByHash[32] =
    {
        0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
        8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
    };

    // Copy the highest set bit into every lower position, so v becomes
    // one less than a power of two.
    for ( unsigned int shift = 1; shift <= 16; shift *= 2 )
        v |= v >> shift;

    const u32 hash = ( u32 )( v * 0x07C4ACDDU ) >> 27;
    return kBitIndexByHash[hash];
}

All the other answers in this thread either run much more poorly than their authors suggest, or don't calculate the result correctly, or both. Let's benchmark them all, and let's verify that they do what they claim to do.

Here's a simple C++11 harness to test all these implementations. It compiles clean on Visual Studio but should work on all modern compilers. It allows you to run the benchmark in performance mode (bVerifyResults = false) and in checking mode (bVerifyResults = true).

Here are the results in verification mode:

Verification failed for msbNative64: input was 0; output was 818af060; expected 0
Verification failed for msbFfs: input was 22df; output was 0; expected d
Verification failed for msbPerformanceJunkie32: input was 0; output was ffffffff; expected 0
Verification failed for msbNative32: input was 0; output was 9ab07060; expected 0

The "performance junkie" and the Microsoft native implementations do different things when the input is zero. msbPerformanceJunkie32 produces -1, and Microsoft's _BitScanReverse produces a random number, consistent with the underlying hardware instruction. Also the msbPerformanceJunkie32 implementation produces a result that is off by one from all the other answers.

Here are the results in performance mode, running on my i7-4600 laptop, compiled in release mode:

msbLoop64 took 2.56751 seconds               
msbNative64 took 0.222197 seconds            

msbLoop32 took 1.43456 seconds               
msbFfs took 0.525097 seconds                 
msbPerformanceJunkie32 took 1.07939 seconds  
msbDeBruijn32 took 0.224947 seconds          
msbNative32 took 0.218275 seconds

The de Bruijn version beats the other implementations soundly because it is branchless, and therefore it runs well against inputs that produce an evenly distributed set of outputs. All the other versions are slower against arbitrary inputs because of the penalties of branch misprediction on modern CPUs. The msbFfs function produces incorrect results so it can be ignored.

Some of the implementations work on 32 bit inputs, and some work on 64 bit inputs. A template will help us compare apples to apples, regardless of the input size.

Here's the code. Download and run the benchmarks yourself if you like.

#include <iostream>
#include <chrono>
#include <random>
#include <cassert>
#include <string>
#include <limits>
#include <vector>   // std::vector is used by test() and main(); do not rely on transitive includes

#ifdef _MSC_VER
#define MICROSOFT_COMPILER 1
#include <intrin.h>
#endif // _MSC_VER

// Number of inputs generated (and calls made) per benchmark run.
const int iterations = 100000000;
// false: time each implementation; true: compare each against the reference results.
bool bVerifyResults = false;
// Random source shared by all input generation.
std::random_device rd;
std::default_random_engine re(rd());
typedef unsigned int u32;
typedef unsigned long long u64;

// Stopwatch over std::chrono::high_resolution_clock; starts when constructed.
class Timer
{
    using clock_ = std::chrono::high_resolution_clock;
    using second_ = std::chrono::duration<double, std::ratio<1> >;

public:
    Timer() : beg_(clock_::now()) {}

    // Restarts the measurement from the current instant.
    void reset() { beg_ = clock_::now(); }

    // Fractional seconds elapsed since construction or the last reset().
    double elapsed() const {
        const second_ span =
            std::chrono::duration_cast<second_>(clock_::now() - beg_);
        return span.count();
    }

private:
    std::chrono::time_point<clock_> beg_;
};

// Returns the one-based position of the highest set bit of x (0 when x == 0):
// x is narrowed to a 4-bit value in three steps, then finished by table lookup.
unsigned int msbPerformanceJunkie32(u32 x)
{
    // One-based position of the highest set bit for each 4-bit value.
    static const unsigned int kNibblePos[] =
    { 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4 };
    static const unsigned int kMask[3] = { 0xFFFF0000, 0x0000FF00, 0x000000F0 };
    static const unsigned int kShift[3] = { 16 / 1, 16 / 2, 16 / 4 };

    unsigned int pos = 0;
    for (int step = 0; step < 3; ++step)
    {
        if ((x & kMask[step]) != 0)
        {
            pos += kShift[step];
            x >>= kShift[step];
        }
    }
    return pos + kNibblePos[x];
}

// FFS(t): function-body macro returning the zero-based index of the LOWEST set
// bit of the 32-bit value t (31 when t == 0), found by a five-step binary search.
// It expands to a complete `{ ... return n; }` body, so it must be used as the
// entire body of a function returning an integer.
// `n` is a plain local: the `register` storage class is not valid in C++17.
// The argument is parenthesized so that expressions such as `a | b` work.
#define FFS(t)  \
{ \
int n = 0; \
if (!(0xffff & (t))) \
n += 16; \
if (!((0xff << n) & (t))) \
n += 8; \
if (!((0xf << n) & (t))) \
n += 4; \
if (!((0x3 << n) & (t))) \
n += 2; \
if (!((0x1 << n) & (t))) \
n += 1; \
return n; \
}

// Wraps the FFS macro, which expands to this function's entire body and
// performs the return itself.
// NOTE(review): FFS searches from the least-significant end, so despite the
// "msb" name this yields the lowest set bit's index — confirm against the macro.
unsigned int msbFfs32(u32 x)
{
    FFS(x);
}

// Returns the zero-based index of the highest set bit of x (0 when x == 0)
// by counting how many right shifts it takes to clear x.
unsigned int msbLoop32(u32 x)
{
    unsigned int index = 0;
    for (x >>= 1; x != 0; x >>= 1)
        ++index;
    return index;
}

// 64-bit variant: zero-based index of the highest set bit of x (0 when x == 0),
// found by counting the right shifts needed to clear x.
unsigned int msbLoop64(u64 x)
{
    unsigned int index = 0;
    for (x >>= 1; x != 0; x >>= 1)
        ++index;
    return index;
}

// Returns the zero-based index of the highest set bit of v (0 when v == 0)
// using a de Bruijn multiply-and-lookup. Relies on u32 being 32 bits wide.
u32 msbDeBruijn32(u32 v)
{
    // Bit index for each possible value of the top five bits of the product.
    static const int kBitIndexByHash[32] =
    {
        0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
        8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
    };

    // Copy the highest set bit into every lower position, so v becomes
    // one less than a power of two.
    for (unsigned int shift = 1; shift <= 16; shift *= 2)
        v |= v >> shift;

    const u32 hash = (u32)(v * 0x07C4ACDDU) >> 27;
    return kBitIndexByHash[hash];
}

#ifdef MICROSOFT_COMPILER
// Index of the highest set bit of val via the MSVC _BitScanReverse intrinsic.
// The intrinsic's return value is ignored and `result` starts uninitialized, so
// for val == 0 the returned value is whatever `result` happened to contain
// (NOTE(review): presumably the intrinsic leaves the index unwritten for a zero
// mask — confirm in the MSVC documentation).
u32 msbNative32(u32 val)
{
    unsigned long result;
    _BitScanReverse(&result, val);
    return result;
}
// 64-bit counterpart using _BitScanReverse64; same caveat for val == 0.
u32 msbNative64(u64 val)
{
    unsigned long result;
    _BitScanReverse64(&result, val);
    return result;
}
#endif // MICROSOFT_COMPILER

// Runs one MSB implementation over the first `iterations` elements of `inputs`.
//  - bIsReference == true: stores msbFunc's outputs into `results` (the expected
//    values for later calls) and returns without printing anything.
//  - otherwise, when the global bVerifyResults is set: reports the first input
//    whose output differs from `results`.
//  - otherwise: times the calls and prints the elapsed seconds.
// `inputs` and `results` must each hold at least `iterations` elements.
template <typename InputType>
void test(unsigned int msbFunc(InputType),
    const std::string &name,
    const std::vector< InputType > &inputs,
    std::vector< unsigned int > &results,
    bool bIsReference = false
)
{
    if (bIsReference)
    {
        for (int i = 0; i < iterations; i++)
            results[i] = msbFunc(inputs[i]);
        // A reference run is never reported as timed or verified, so running
        // the loop a second time would only burn time.
        return;
    }

    InputType result = 0;
    if (bVerifyResults)
    {
        for (int i = 0; i < iterations; i++)
        {
            result = msbFunc(inputs[i]);
            if (result != results[i])
            {
                // Only the first mismatch is reported; std::dec restores the
                // stream so later integer output is not left in hexadecimal.
                std::cout << "Verification failed for " << name << ": "
                    << "input was " << std::hex << inputs[i]
                    << "; output was " << result
                    << "; expected " << results[i]
                    << std::dec << std::endl;
                break;
            }
        }
    }
    else
    {
        Timer t;
        for (int i = 0; i < iterations; i++)
        {
            result = msbFunc(inputs[i]);
        }
        const double elapsed = t.elapsed();
        std::cout << name << " took " << elapsed << " seconds" << std::endl;
        // Consuming `result` keeps the timed loop from being optimized away.
        if (result == -1.0f)
            std::cout << "this comparison only exists to keep the compiler from " <<
            "optimizing out the benchmark; this branch will never be called";
    }
}

void main()
{
    std::uniform_int_distribution <u64> dist64(0,
        std::numeric_limits< u64 >::max());
    std::uniform_int_distribution <u32> shift64(0, 63);
    std::vector< u64 > inputs64;
    for (int i = 0; i < iterations; i++)
    {
        inputs64.push_back(dist64(re) >> shift64(re));
    }
    std::vector< u32 > results64;
    results64.resize(iterations);

    test< u64 >(msbLoop64, "msbLoop64", inputs64, results64, true);
    test< u64 >(msbLoop64, "msbLoop64", inputs64, results64, false);
#ifdef MICROSOFT_COMPILER
    test< u64 >(msbNative64, "msbNative64", inputs64, results64, false);
#endif // MICROSOFT_COMPILER
    std::cout << std::endl;

    std::uniform_int_distribution <u32> dist32(0,
        std::numeric_limits< u32 >::max());
    std::uniform_int_distribution <u32> shift32(0, 31);
    std::vector< u32 > inputs32;
    for (int i = 0; i < iterations; i++)
        inputs32.push_back(dist32(re) >> shift32(re));
    std::vector< u32 > results32;
    results32.resize(iterations);


    test< u32 >(msbLoop32, "msbLoop32", inputs32, results32, true);

    test< u32 >(msbLoop32, "msbLoop32", inputs32, results32, false);
    test< u32 >(msbFfs32, "msbFfs", inputs32, results32, false);
    test< u32 >(msbPerformanceJunkie32, "msbPerformanceJunkie32",
        inputs32, results32, false);
    test< u32 >(msbDeBruijn32, "msbDeBruijn32", inputs32, results32, false);
#ifdef MICROSOFT_COMPILER
    test< u32 >(msbNative32, "msbNative32", inputs32, results32, false);
#endif // MICROSOFT_COMPILER
}

回复收藏 0 原文

于我来说 2024-09-04 06:46:02

作为一个性能迷，我尝试了很多 MSB 集的变体，以下是我遇到的最快的，

/* Returns the one-based position of the highest set bit of x (0 when x == 0):
   x is narrowed to a 4-bit value in three steps, then finished by table lookup. */
unsigned int msb32(unsigned int x)
{
    /* One-based position of the highest set bit for each 4-bit value. */
    static const unsigned int nibble_pos[] =
    {0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4};
    static const unsigned int mask[3] = {0xFFFF0000, 0x0000FF00, 0x000000F0};
    static const unsigned int shift[3] = {16/1, 16/2, 16/4};

    unsigned int pos = 0;
    int step;
    for (step = 0; step < 3; ++step) {
        if ((x & mask[step]) != 0) {
            pos += shift[step];
            x >>= shift[step];
        }
    }
    return pos + nibble_pos[x];
}

As a performance junkie I have tried a ton of variations for MSB set, the following is the fastest I have come across,

/* Returns the one-based position of the highest set bit of x (0 when x == 0):
   x is narrowed to a 4-bit value in three steps, then finished by table lookup. */
unsigned int msb32(unsigned int x)
{
    /* One-based position of the highest set bit for each 4-bit value. */
    static const unsigned int nibble_pos[] =
    {0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4};
    static const unsigned int mask[3] = {0xFFFF0000, 0x0000FF00, 0x000000F0};
    static const unsigned int shift[3] = {16/1, 16/2, 16/4};

    unsigned int pos = 0;
    int step;
    for (step = 0; step < 3; ++step) {
        if ((x & mask[step]) != 0) {
            pos += shift[step];
            x >>= shift[step];
        }
    }
    return pos + nibble_pos[x];
}

回复收藏 0 原文

‘画卷フ 2024-09-04 06:46:02

有多种方法可以做到这一点，并且不同实现的相对性能在某种程度上取决于机器（我碰巧出于类似目的在某种程度上对此进行了基准测试）。在某些机器上甚至有一个内置指令（如果可用并且可以处理可移植性，请使用一个指令）。

在此处查看一些实现（在“以 2 为底的整数对数”下）。如果您使用的是 GCC，请查看函数 __builtin_clz 和 __builtin_clzl （它们分别对非零无符号整数和无符号长整数执行此操作）。 “clz”代表“计算前导零”，这是描述同一问题的另一种方式。

当然，如果您的位数组不适合合适的机器字，则需要迭代数组中的字以找到第一个非零字，然后仅对该字执行此计算。

回复收藏 0 原文

奢华的一滴泪 2024-09-04 06:46:02

查找 BSR（位扫描反向）x86 asm 指令以获得执行此操作的最快方法。来自英特尔的文档：
在源操作数（第二个操作数）中搜索最高有效设置位（1 位）。如果找到最高有效 1 位，则其位索引存储在目标操作数中（第一个操作数）。

回复收藏 0 原文

楠木可依 2024-09-04 06:46:02

http://graphics.stanford.edu/~seander/bithacks.html#IntegerLogObvious

回复收藏 0 原文

家住魔仙堡 2024-09-04 06:46:02

我使用了许多函数来获取最高有效位，但在 32 和 64 位数字之间移动或在 x86_64 和 x86 框之间移动时通常会出现问题。函数 __builtin_clz、__builtin_clzl 和 __builtin_clzll 适用于 32/64 位数字以及跨 x86_64 和 x86 机器。然而，需要三个功能。我发现了一个简单的 MSB，它依赖于右移，可以处理所有正数情况。至少就我对它的使用而言，它在其他人失败的地方取得了成功：

/* Returns the zero-based index of the highest set bit of x (0 when x == 0)
   by counting how many right shifts it takes to clear x. */
int
getmsb (unsigned long long x)
{
    int index = 0;
    for (x >>= 1; x != 0; x >>= 1)
        ++index;
    return index;
}

通过将输入指定为 unsigned long long，它可以处理从 unsigned char 到 unsigned long long 的所有数字类型，并且根据标准定义，它在 x86_64 和 x86 版本之间兼容。0 的情况被定义为返回 0，但可以根据需要进行更改。一个简单的测试和输出是：

/* Demo driver: prints each sample value next to the index getmsb() reports
   for it, covering every unsigned width plus one plain int. */
int
main (int argc, char *argv[]) {

    unsigned char c0 = 0;
    unsigned char c = 216;
    unsigned short s = 1021;
    unsigned int ui = 32768;
    unsigned long ul = 3297381253;
    unsigned long long ull = 323543844043;

    int i = 32767;

    printf ("  %16u  MSB : %d\n", c0, getmsb (c0));
    printf ("  %16u  MSB : %d\n", c, getmsb (c));
    printf ("  %16u  MSB : %d\n", s, getmsb (s));
    /* i is a signed int, so it is printed with %d rather than %u */
    printf ("  %16d  MSB : %d\n", i, getmsb (i));
    printf ("  %16u  MSB : %d\n", ui, getmsb (ui));
    printf ("  %16lu  MSB : %d\n", ul, getmsb (ul));
    printf ("  %16llu  MSB : %d\n", ull, getmsb (ull));

    return 0;
}

输出：

             0  MSB : 0
           216  MSB : 7
          1021  MSB : 9
         32767  MSB : 14
         32768  MSB : 15
    3297381253  MSB : 31
  323543844043  MSB : 38

注意：出于速度考虑，使用以 __builtin_clzll 为中心的单个函数来完成相同的事情仍然快大约 6 倍。

I have worked with a number of functions to get the most significant bit, but problems generally arise moving between 32 and 64 bit numbers or moving between x86_64 and x86 boxes. The functions __builtin_clz, __builtin_clzl and __builtin_clzll work well for 32/64 bit numbers and across x86_64 and x86 machines. However, three functions are required. I have found a simple MSB that relies on right-shift that will handle all cases for positive numbers. At least for the use I make of it, it has succeeded where others have failed:

/* Returns the zero-based index of the highest set bit of x (0 when x == 0)
   by counting how many right shifts it takes to clear x. */
int
getmsb (unsigned long long x)
{
    int index = 0;
    for (x >>= 1; x != 0; x >>= 1)
        ++index;
    return index;
}

By designating input as unsigned long long it can handle all number classes from unsigned char to unsigned long long and given the standard definition, it is compatible across x86_64 and x86 builds. The case for 0 is defined to return 0, but can be changed as required. A simple test and output are:

/* Demo driver: prints each sample value next to the index getmsb() reports
   for it, covering every unsigned width plus one plain int. */
int
main (int argc, char *argv[]) {

    unsigned char c0 = 0;
    unsigned char c = 216;
    unsigned short s = 1021;
    unsigned int ui = 32768;
    unsigned long ul = 3297381253;
    unsigned long long ull = 323543844043;

    int i = 32767;

    printf ("  %16u  MSB : %d\n", c0, getmsb (c0));
    printf ("  %16u  MSB : %d\n", c, getmsb (c));
    printf ("  %16u  MSB : %d\n", s, getmsb (s));
    /* i is a signed int, so it is printed with %d rather than %u */
    printf ("  %16d  MSB : %d\n", i, getmsb (i));
    printf ("  %16u  MSB : %d\n", ui, getmsb (ui));
    printf ("  %16lu  MSB : %d\n", ul, getmsb (ul));
    printf ("  %16llu  MSB : %d\n", ull, getmsb (ull));

    return 0;
}

Output:

             0  MSB : 0
           216  MSB : 7
          1021  MSB : 9
         32767  MSB : 14
         32768  MSB : 15
    3297381253  MSB : 31
  323543844043  MSB : 38

NOTE: for speed considerations, using a single function to accomplish the same thing centered around __builtin_clzll is still faster by a factor of about 6.

回复收藏 0 原文

国产ˉ祖宗 2024-09-04 06:46:02

x86 有一个 BSR 指令，它返回一个位索引（而不是其上方的前导零的数量）。

但不幸的是，没有可移植的内在函数可以有效为所有编译器公开它。 GNU C 提供了 __builtin_clz，但是 unsigned bitidx = 31 - __builtin_clz(x); 不会在当前的 GCC 和 ICC 中优化回 BSR。（它与 clang 一起使用，这证明了表达式是等价的，所以它可以）。

下面定义了 BSR32() 和 BSR64() 宏或函数，它们可以有效地编译为 bsr 指令x86。（如果输入为零，则产生垃圾结果。内部函数无法利用 asm 指令的行为，即在输入 = 0 时不修改目标。）

向非 x86 的可移植性需要一些额外的 #ifdef 例如回退到 31-__builtin_clz。大多数非 x86 ISA，如果它们有前导零位扫描，则会计算前导零而不是提供位索引。这就是 GNU C 将 __builtin_clz 定义为可移植内置函数的原因。（如果目标系统上没有硬件支持，内置函数将编译为软件模拟，通常调用 libgcc 辅助函数。）

#include <stdint.h>

// define BSR32() and BSR64()
// Each returns the bit index of the highest set bit of its argument.
// Every variant below ignores the zero-input case, so the result for an
// input of 0 is not meaningful.
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
    // The index out-parameter type differs between the two compilers.
    #ifdef __INTEL_COMPILER
        typedef unsigned int bsr_idx_t;
    #else
        #include <intrin.h>   // MSVC
        typedef unsigned long bsr_idx_t;
    #endif

    // 32-bit scan; idx is left uninitialized if the intrinsic does not write it.
    static inline
    unsigned BSR32(unsigned long x){
        bsr_idx_t idx;
        _BitScanReverse(&idx, x); // ignore bool retval
        return idx;
    }
    // 64-bit scan; same structure as BSR32.
    static inline
    unsigned BSR64(uint64_t x) {
        bsr_idx_t idx;
        _BitScanReverse64(&idx, x); // ignore bool retval
        return idx;
    }
#elif defined(__GNUC__)

  // clang: derive the index from the leading-zero count.
  #ifdef __clang__
    static inline unsigned BSR64(uint64_t x) {
        return 63-__builtin_clzll(x);
      // gcc/ICC can't optimize this back to just BSR, but clang can and doesn't provide alternate intrinsics
    }
  #else
    // Other GNU-compatible compilers: map directly onto the x86 builtin.
    #define BSR64 __builtin_ia32_bsrdi
  #endif

    // x86intrin.h supplies _bit_scan_reverse for the 32-bit case.
    #include <x86intrin.h>
    #define BSR32(x) _bit_scan_reverse(x)

#endif

bsf 可能不需要编译器那么多的帮助，因为内置函数匹配asm 指令返回 LSB 位索引的行为，即尾随零的计数。

测试调用者 unsigned test32(unsigned x) { return BSR32(x); } 在 Godbolt 编译器浏览器上，会被所有主要的 x86 编译器内联为 1 条指令。BSR64 以相同的方式内联为 64 位操作数大小的版本。另请参阅“是否有 x86/x86_64 指令将最高有效位以下的所有位清零？”以了解示例用例。

;; x64 MSVC 19.16 -O2
unsigned int test32(unsigned int) PROC                                    ; test32, COMDAT
        bsr     eax, ecx
        ret     0
unsigned int test32(unsigned int) ENDP                                    ; test32

# clang -O3 -march=haswell   is too "smart?" for its own good:
test32(unsigned int):
        lzcnt   eax, edi
        xor     eax, 31
        ret

# gcc8.2 -O3 -march=haswell
test32(unsigned int):
        bsr     eax, edi
        ret

# ICC19 -O3 -march=haswell
test32(unsigned int):
        bsr       eax, edi                                      #15.9
        ret                                                     #41.12

这样做的目的是避免可移植（到非 MSVC）版本中的代码缓慢：

#ifdef __GNUC__
// Bit index of the highest set bit of x, computed as 63 minus the
// leading-zero count. x is expected to be nonzero.
unsigned badgcc(uint64_t x) {
    const int leading_zeros = __builtin_clzll(x);
    return 63 - leading_zeros;
}
#endif

如果没有 -march=haswell，我们只能从 clang 获得 BSR，但是：

# gcc8.2 -O3
badgcc(unsigned long):
        bsr     rdi, rdi
        mov     eax, 63
        xor     rdi, 63
        sub     eax, edi
        ret

# ICC19.0.1 -O3
badgcc(unsigned long):
        mov       rax, -1                                       #46.17
        bsr       rdx, rdi                                      #46.17
        cmove     rdx, rax                                      #46.17
        neg       rdx                                           #46.17
        add       rdx, 63                                       #46.17
        neg       edx                                           #46.17
        add       edx, 63                                       #46.17
        mov       eax, edx                                      #46.17
        ret                                                     #46.17

这太糟糕了。（有趣的是，如果输入为零，ICC 会执行 CMOV 来生成 -1。BSR 根据其输入设置 ZF，这与大多数根据结果设置标志的指令不同。）

使用 -march=haswell（或以其他方式启用 BMI1 指令）时，情况没有那么糟糕，但仍然不如单独一条 BSR 好。输出依赖除外：编译器大多会设法为 lzcnt 避免这种依赖，但奇怪的是对 BSR 却不这样做。（其中输出依赖是真依赖，这是由 input=0 时的行为造成的。）另请参阅：为什么打破 LZCNT 的“输出依赖”很重要？

x86 has a BSR instruction that returns a bit-index (rather than the count of leading zeros above it).

But unfortunately there's no portable intrinsic that efficiently exposes it for all compilers. GNU C provides __builtin_clz, but unsigned bitidx = 31 - __builtin_clz(x); doesn't optimize back to just BSR with current GCC and ICC. (It does with clang, which proves that the expression is equivalent so it could).

The following defines BSR32() and BSR64() macros or functions that compile efficiently to just a bsr instruction on x86. (Producing a garbage result if the input was zero. There's no way with intrinsics to take advantage of the asm instruction's behaviour of leaving the destination unmodified for input=0.)

Portability to non-x86 would take some additional #ifdef e.g. to fall back to 31-__builtin_clz. Most non-x86 ISAs, if they have a leading-zero bitscan at all, count leading zeros instead of giving you the bit-index. That's why GNU C defines __builtin_clz as the portable builtin. (If there's no HW support on the target system, the builtin will compile to software emulation, usually calling a libgcc helper function.)

#include <stdint.h>

// define BSR32() and BSR64()
// Each returns the bit index of the highest set bit of its argument.
// Every variant below ignores the zero-input case, so the result for an
// input of 0 is not meaningful.
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
    // The index out-parameter type differs between the two compilers.
    #ifdef __INTEL_COMPILER
        typedef unsigned int bsr_idx_t;
    #else
        #include <intrin.h>   // MSVC
        typedef unsigned long bsr_idx_t;
    #endif

    // 32-bit scan; idx is left uninitialized if the intrinsic does not write it.
    static inline
    unsigned BSR32(unsigned long x){
        bsr_idx_t idx;
        _BitScanReverse(&idx, x); // ignore bool retval
        return idx;
    }
    // 64-bit scan; same structure as BSR32.
    static inline
    unsigned BSR64(uint64_t x) {
        bsr_idx_t idx;
        _BitScanReverse64(&idx, x); // ignore bool retval
        return idx;
    }
#elif defined(__GNUC__)

  // clang: derive the index from the leading-zero count.
  #ifdef __clang__
    static inline unsigned BSR64(uint64_t x) {
        return 63-__builtin_clzll(x);
      // gcc/ICC can't optimize this back to just BSR, but clang can and doesn't provide alternate intrinsics
    }
  #else
    // Other GNU-compatible compilers: map directly onto the x86 builtin.
    #define BSR64 __builtin_ia32_bsrdi
  #endif

    // x86intrin.h supplies _bit_scan_reverse for the 32-bit case.
    #include <x86intrin.h>
    #define BSR32(x) _bit_scan_reverse(x)

#endif

bsf probably doesn't need as much help for compilers, because the builtin matches the asm instruction's behaviour of returning the bit-index of the LSB, i.e. the count of trailing zeros.

A test caller unsigned test32(unsigned x) { return BSR32(x); } inlines it to 1 instruction on all the major x86 compilers, on the Godbolt compiler explorer. BSR64 inlines the same way, to a 64-bit operand-size version. See also Is there an x86/x86_64 instruction which zeros all bits below the Most Significant Bit? for example use-cases.

;; x64 MSVC 19.16 -O2
unsigned int test32(unsigned int) PROC                                    ; test32, COMDAT
        bsr     eax, ecx
        ret     0
unsigned int test32(unsigned int) ENDP                                    ; test32

# clang -O3 -march=haswell   is too "smart?" for its own good:
test32(unsigned int):
        lzcnt   eax, edi
        xor     eax, 31
        ret

# gcc8.2 -O3 -march=haswell
test32(unsigned int):
        bsr     eax, edi
        ret

# ICC19 -O3 -march=haswell
test32(unsigned int):
        bsr       eax, edi                                      #15.9
        ret                                                     #41.12

The point of this is to avoid slow code from the portable (to non-MSVC) version:

#ifdef __GNUC__
// Bit index of the highest set bit of x, computed as 63 minus the
// leading-zero count. x is expected to be nonzero.
unsigned badgcc(uint64_t x) {
    const int leading_zeros = __builtin_clzll(x);
    return 63 - leading_zeros;
}
#endif

Without -march=haswell we get just BSR from clang, but:

# gcc8.2 -O3
badgcc(unsigned long):
        bsr     rdi, rdi
        mov     eax, 63
        xor     rdi, 63
        sub     eax, edi
        ret

# ICC19.0.1 -O3
badgcc(unsigned long):
        mov       rax, -1                                       #46.17
        bsr       rdx, rdi                                      #46.17
        cmove     rdx, rax                                      #46.17
        neg       rdx                                           #46.17
        add       rdx, 63                                       #46.17
        neg       edx                                           #46.17
        add       edx, 63                                       #46.17
        mov       eax, edx                                      #46.17
        ret                                                     #46.17

That's just nasty. (Interesting to see that ICC is doing a CMOV to produce -1 if the input is zero. BSR sets ZF according to its input, unlike most instructions which set flags according to the result.)

With -march=haswell (or otherwise enabling use of BMI1 instructions), it's not as bad, but still not as good as just BSR. Modulo output dependencies, which compilers mostly work to avoid for lzcnt but strangely not for BSR. (Where the output dependency is a true dependency, because of the input=0 behaviour.) Why does breaking the "output dependency" of LZCNT matter?

回复收藏 0 原文

吃→可爱长大的 2024-09-04 06:46:02

如果您使用的是 x86，可以将 SSE2 操作与查找位指令结合使用，从而击败几乎任何逐字节或逐字的解决方案；这类指令（在 gcc 世界中）最低位的读作“ffs”，最高位的读作“fls”。
请原谅我在回答中格式化“C”代码时遇到麻烦（！@#$%^）；查看：
http://mischasan.wordpress.com/2011/11/03/sse2-bit-trick-ffsfls-for-xmm-registers/

回复收藏 0 原文

ペ泪落弦音 2024-09-04 06:46:02

我知道在纯 C 中执行此操作的两种最佳方法：

首先线性搜索字节/字数组以查找第一个非零的字节/字，然后对找到的字节/字进行展开的二进制搜索。

// Unrolled binary search over one byte b: returns the position of its highest
// set bit counted from the left (0 for bit 0x80 ... 7 for bit 0x01).
// Fragment only: it is meant to sit inside a function that has b in scope.
// Note that b == 0 also falls through to the final `return 7`, so callers are
// expected to pass a nonzero byte.
if (b>=0x10)
  if (b>=0x40)
    if (b>=0x80) return 0;
    else return 1;
  else
    if (b>=0x20) return 2;
    else return 3;
else
  if (b>=0x4)
    if (b>=0x8) return 4;
    else return 5;
  else
    if (b>=0x2) return 6;
    else return 7;

3（顺便说一句，即 log2(8)）条件跳转以获得答案。在现代 x86 机器上，最后一个将被优化为条件 mov。

或者，使用查找表将字节映射到所设置的第一位的索引。

您可能想要查找的相关主题是整数 log2 函数。如果我记得的话，ffmpeg 有一个很好的实现。

编辑：您实际上可以将上述二分搜索变成无分支二分搜索，但我不确定在这种情况下它是否会更有效......

Two best ways I know to do this in pure C:

First linear-search the byte/word array to find the first byte/word that's nonzero, then do an unrolled binary-search of the byte/word you find.

// Unrolled binary search over one byte b: returns the position of its highest
// set bit counted from the left (0 for bit 0x80 ... 7 for bit 0x01).
// Fragment only: it is meant to sit inside a function that has b in scope.
// Note that b == 0 also falls through to the final `return 7`, so callers are
// expected to pass a nonzero byte.
if (b>=0x10)
  if (b>=0x40)
    if (b>=0x80) return 0;
    else return 1;
  else
    if (b>=0x20) return 2;
    else return 3;
else
  if (b>=0x4)
    if (b>=0x8) return 4;
    else return 5;
  else
    if (b>=0x2) return 6;
    else return 7;

3 (BTW that's log2(8)) conditional jumps to get the answer. On modern x86 machines the last one will be optimized to a conditional mov.

Alternatively, use a lookup table to map the byte to the index of the first bit that's set.

A related topic you might want to look up is integer log2 functions. If I recall, ffmpeg has a nice implementation.

Edit: You can actually make the above binary search into a branchless binary search, but I'm not sure if it would be more efficient in this case...

回复收藏 0 原文

乄_柒ぐ汐 2024-09-04 06:46:02

不是最快的，但它有效......

//// C program
#include <math.h>

/* POS_OF_HIGHESTBIT(a): zero-based index (0 = least-significant bit) of the
   highest set bit of a, taken as the truncated base-2 logarithm.
   Do not use if a <= 0. */
#define POS_OF_HIGHESTBIT(a) \
((unsigned) log2(a))

/* NUM_OF_HIGHESTBIT(a): the value of the highest set bit of a, or 0 when no
   bit is set. The shifted constant is unsigned so that a result in bit 31
   does not overflow a signed int. */
#define NUM_OF_HIGHESTBIT(a) ((!(a))          \
        ? 0u /* no msb set*/                  \
        : (1u << POS_OF_HIGHESTBIT(a) ))
// could be changed and optimized, if it is known that the following NEVER holds: a <= 0



/* Demo: evaluates NUM_OF_HIGHESTBIT for the value 5 (binary 101). The result
 * is computed but never printed. */
int main()
{
  unsigned sample = 5;
  unsigned highest = NUM_OF_HIGHESTBIT(sample); /* 4, i.e. binary 100 */
  (void) highest; /* kept for illustration only */
  return 0;
}

Not the fastest, but it works...

//// C program
#include <math.h>

/*
 * POS_OF_HIGHESTBIT(a): zero-based position of the highest set bit of a,
 * where position 0 is the least significant bit. It truncates log2(a) to
 * unsigned, so it must not be used when a <= 0.
 */
#define POS_OF_HIGHESTBIT(a) ((unsigned) log2(a))

/*
 * NUM_OF_HIGHESTBIT(a): value of the highest set bit of a (e.g. 5 -> 4), or
 * 0 when no bit is set. The shift is done on an unsigned 1 so that reaching
 * the top bit position does not overflow a signed int. The zero check could
 * be dropped if a <= 0 is known never to occur.
 */
#define NUM_OF_HIGHESTBIT(a) ((!(a)) ? 0U : (1U << POS_OF_HIGHESTBIT(a)))



/* Demo: evaluates NUM_OF_HIGHESTBIT for the value 5 (binary 101). The result
 * is computed but never printed. */
int main()
{
  unsigned sample = 5;
  unsigned highest = NUM_OF_HIGHESTBIT(sample); /* 4, i.e. binary 100 */
  (void) highest; /* kept for illustration only */
  return 0;
}

回复收藏 0 原文

余生一个溪 2024-09-04 06:46:02

这是解释 __builtin_clz() 的代码片段

////// go.c ////////
#include <stdio.h>

/* Index of the top bit of an unsigned: its size in bytes times 8, minus 1. */
unsigned NUM_BITS_U = (sizeof(unsigned) * 8 - 1);

/* Zero-based position (position 0 = least significant bit) of the highest
 * set bit of a: the top bit index minus the leading-zero count reported by
 * __builtin_clz. Only works for a != 0. */
#define POS_OF_HIGHESTBITclz(a) (NUM_BITS_U - __builtin_clz(a))

/* Value of the highest set bit of a, or 0 when a is 0. */
#define NUM_OF_HIGHESTBITclz(a) (!(a) ? 0 : (1U << POS_OF_HIGHESTBITclz(a)))


/* Prints, for every ui from 0 through 17, ui and the value of its highest
 * set bit, tab-separated, one pair per line. */
int main()
{
  unsigned ui;

  for (ui = 0U; ui < 18U; ++ui)
    /* Both arguments are unsigned, so they are printed with %u, not %i. */
    printf("%u \t %u\n", ui, NUM_OF_HIGHESTBITclz(ui));

  return 0;
}

Here's a code snippet explaining __builtin_clz()

////// go.c ////////
#include <stdio.h>

/* Index of the top bit of an unsigned: its size in bytes times 8, minus 1. */
unsigned NUM_BITS_U = (sizeof(unsigned) * 8 - 1);

/* Zero-based position (position 0 = least significant bit) of the highest
 * set bit of a: the top bit index minus the leading-zero count reported by
 * __builtin_clz. Only works for a != 0. */
#define POS_OF_HIGHESTBITclz(a) (NUM_BITS_U - __builtin_clz(a))

/* Value of the highest set bit of a, or 0 when a is 0. */
#define NUM_OF_HIGHESTBITclz(a) (!(a) ? 0 : (1U << POS_OF_HIGHESTBITclz(a)))


/* Prints, for every ui from 0 through 17, ui and the value of its highest
 * set bit, tab-separated, one pair per line. */
int main()
{
  unsigned ui;

  for (ui = 0U; ui < 18U; ++ui)
    /* Both arguments are unsigned, so they are printed with %u, not %i. */
    printf("%u \t %u\n", ui, NUM_OF_HIGHESTBITclz(ui));

  return 0;
}

回复收藏 0 原文

淡淡の花香 2024-09-04 06:46:02

我也来补充一个！

/* Short aliases for the unsigned integer types. */
typedef unsigned long long u64;
typedef unsigned int       u32;
typedef unsigned char      u8;


/*
 * Returns the zero-based index (bit 0 = least significant) of the highest
 * set bit of u64Val. Asserts that u64Val is non-zero.
 *
 * Binary search over the shift widths 32, 16, 8, 4, 2, 1: whenever the part
 * above the current width is non-zero, continue with that part and record
 * the width in the result.
 */
u8 findMostSignificantBit (u64 u64Val)
{
  u8 index = 0;
  u8 width = 32;

  assert (u64Val != 0ULL);

  while (width != 0)
  {
    const u64 upper = u64Val >> width;
    if (upper != 0)
    {
      /* The widths are distinct powers of two, so OR acts as addition. */
      index = (u8) (index | width);
      u64Val = upper;
    }
    width = (u8) (width >> 1);
  }

  return index;
}

当然，这适用于 64 位数字（unsigned long long），而不是数组。另外，很多人指出了我不知道的内置 g++ 函数。多么有趣啊。

无论如何，这会在 6 次迭代中找到最高有效位，并且如果将 0 传递给函数，则会触发断言。如果可以使用处理器提供的专用指令，那么这个函数就不是最佳选择。

我还使用 |= 而不是 +=，因为这些值始终是 2 的幂，并且 OR（传统上）比加法更快。由于我只是把互不相同的 2 的幂加在一起，所以永远不会产生进位。

这是二分查找，这意味着它总是在 6 次迭代中找到结果。

再说一遍，这样更好：

/*
 * Returns the zero-based index (bit 0 = least significant) of the highest
 * set bit of u64Val. Asserts that u64Val is non-zero, because the result of
 * __builtin_clzll is undefined for 0.
 */
u8 findMostSignificantBit2 (u64 u64Val)
{
  assert (u64Val != 0ULL);

  /* __builtin_clzll counts leading zero bits; subtracting that count from 63
   * (top bit index, assuming a 64-bit unsigned long long) gives the position
   * of the highest set bit. __builtin_ctzll counts trailing zeros and would
   * locate the lowest set bit instead. */
  return (u8) (63 - __builtin_clzll(u64Val));
}

I'll add one!

/* Short aliases for the unsigned integer types. */
typedef unsigned long long u64;
typedef unsigned int       u32;
typedef unsigned char      u8;


/*
 * Returns the zero-based index (bit 0 = least significant) of the highest
 * set bit of u64Val. Asserts that u64Val is non-zero.
 *
 * Binary search over the shift widths 32, 16, 8, 4, 2, 1: whenever the part
 * above the current width is non-zero, continue with that part and record
 * the width in the result.
 */
u8 findMostSignificantBit (u64 u64Val)
{
  u8 index = 0;
  u8 width = 32;

  assert (u64Val != 0ULL);

  while (width != 0)
  {
    const u64 upper = u64Val >> width;
    if (upper != 0)
    {
      /* The widths are distinct powers of two, so OR acts as addition. */
      index = (u8) (index | width);
      u64Val = upper;
    }
    width = (u8) (width >> 1);
  }

  return index;
}

Of course, this is working on a 64 bit number (unsigned long long), and not an array. Also, plenty of people have pointed to inbuilt g++ functions I was not aware of. How interesting.

Anyhow, this finds the most significant bit in 6 iterations and gives an assert if you passed 0 to the function. Not the best function to use if you have access to an instruction of the chipset.

I am also using |= instead of += because these are always powers of two, and OR is (classically) faster than addition. Since I'm only adding unique powers of 2 together, there is never a carry.

This is a binary search which means it always finds the result in 6 iterations.

Again, this is better:

/*
 * Returns the zero-based index (bit 0 = least significant) of the highest
 * set bit of u64Val. Asserts that u64Val is non-zero, because the result of
 * __builtin_clzll is undefined for 0.
 */
u8 findMostSignificantBit2 (u64 u64Val)
{
  assert (u64Val != 0ULL);

  /* __builtin_clzll counts leading zero bits; subtracting that count from 63
   * (top bit index, assuming a 64-bit unsigned long long) gives the position
   * of the highest set bit. __builtin_ctzll counts trailing zeros and would
   * locate the lowest set bit instead. */
  return (u8) (63 - __builtin_clzll(u64Val));
}

回复收藏 0 原文

落叶缤纷 2024-09-04 06:46:02

这是一个针对任意大小的字节数组的简单强力算法：

int msb( unsigned char x);  // prototype for function that returns 
                            //  most significant bit set

unsigned char* p;

for (p = arr + num_elements; p != arr;) {
    --p;
    if (*p != 0) break;
}

// p is with pointing to the last byte that has a bit set, or
//  it's pointing to the first byte in the array

if (*p) {
    return ((p - arr) * 8) + msb( *p);
}

// what do you want to return if no bits are set?
return -1;

我把它留作读者的练习：实现一个合适的 msb() 函数，并进行优化，使其按 int 或 long long 大小的数据块进行处理。

Here's a simple, brute force algorithm for an arbitrary-sized array of bytes:

int msb( unsigned char x);  // prototype for function that returns 
                            //  most significant bit set

unsigned char* p;

for (p = arr + num_elements; p != arr;) {
    --p;
    if (*p != 0) break;
}

// p is with pointing to the last byte that has a bit set, or
//  it's pointing to the first byte in the array

if (*p) {
    return ((p - arr) * 8) + msb( *p);
}

// what do you want to return if no bits are set?
return -1;

I'll leave it as an exercise for the reader to come up with an appropriate msb() function as well as the optimization to work on int or long long sized chunks of data.

回复收藏 0 原文

自演自醉 2024-09-04 06:46:02

嗯，您的标签指示 32 位，但看起来您使用的值是 16 位。如果你的意思是 32 位，那么我认为 0x00a1 的答案应该是 24 而不是 8。

假设你正在从左侧查找 MSB 位索引，并且你知道你只会处理 uint32_t，这里是显而易见、简单的算法：

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>

/* Prints the index, counted from the most significant bit of a 32-bit value,
 * of the first set bit of test_value; prints nothing when no bit is set. */
int main()
{
    const uint32_t test_value = 0x00a1;
    uint32_t mask = 0x80000000;
    int index = 0;

    /* Slide a one-bit mask from bit 31 down to bit 0. */
    while (index < 32)
    {
        if ((test_value & mask) != 0)
        {
            printf("i = %d\n", index);
            return 0;
        }
        mask >>= 1;
        ++index;
    }

    return 0;
}

Um, your tag indicates 32bit but it looks like the values that you're using are 16 bit. If you did mean 32 bit, then I think the answer for 0x00a1 ought to be 24 and not 8.

Assuming that you are looking for the MSB bit index from the left hand side and you know that you will only be dealing with uint32_t's, here's the obvious, simple-minded algorithm:

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>

/* Prints the index, counted from the most significant bit of a 32-bit value,
 * of the first set bit of test_value; prints nothing when no bit is set. */
int main()
{
    const uint32_t test_value = 0x00a1;
    uint32_t mask = 0x80000000;
    int index = 0;

    /* Slide a one-bit mask from bit 31 down to bit 0. */
    while (index < 32)
    {
        if ((test_value & mask) != 0)
        {
            printf("i = %d\n", index);
            return 0;
        }
        mask >>= 1;
        ++index;
    }

    return 0;
}

回复收藏 0 原文

—━☆沉默づ 2024-09-04 06:46:02

对于java我使用这个：

/**
 * Returns an int in which only the highest set bit of {@code n} is kept
 * (for example 5 -> 4), or 0 when {@code n} is 0.
 *
 * @param n the value to inspect, treated as a 32-bit pattern
 * @return the isolated highest one-bit of {@code n}, or 0 if none is set
 */
static public final int msb(int n) {
    // Library form of "smear the top bit downwards, shift, add one"; unlike
    // that sequence it yields 0 rather than 1 for an input of 0.
    return Integer.highestOneBit(n);
}

并且：

/**
 * Returns the zero-based index (bit 0 = least significant) of the highest
 * set bit of {@code n}.
 *
 * @param n the value to inspect, treated as a 32-bit pattern
 * @return the index of the highest set bit, or 0 when {@code n} is 0
 */
static public final int msb_index(int n) {
    // An input of 0 keeps mapping to index 0 for backward compatibility.
    if (n == 0) {
        return 0;
    }
    // No lookup table is allocated per call: the top bit index (31) minus the
    // leading-zero count is the position of the highest set bit.
    return 31 - Integer.numberOfLeadingZeros(n);
}

For java I use this:

/**
 * Returns an int in which only the highest set bit of {@code n} is kept
 * (for example 5 -> 4), or 0 when {@code n} is 0.
 *
 * @param n the value to inspect, treated as a 32-bit pattern
 * @return the isolated highest one-bit of {@code n}, or 0 if none is set
 */
static public final int msb(int n) {
    // Library form of "smear the top bit downwards, shift, add one"; unlike
    // that sequence it yields 0 rather than 1 for an input of 0.
    return Integer.highestOneBit(n);
}

And:

/**
 * Returns the zero-based index (bit 0 = least significant) of the highest
 * set bit of {@code n}.
 *
 * @param n the value to inspect, treated as a 32-bit pattern
 * @return the index of the highest set bit, or 0 when {@code n} is 0
 */
static public final int msb_index(int n) {
    // An input of 0 keeps mapping to index 0 for backward compatibility.
    if (n == 0) {
        return 0;
    }
    // No lookup table is allocated per call: the top bit index (31) minus the
    // leading-zero count is the position of the highest set bit.
    return 31 - Integer.numberOfLeadingZeros(n);
}

回复收藏 0 原文

别挽留 2024-09-04 06:46:02

/*
 * FFS(t): zero-based index of the lowest set bit of t, found by binary
 * search over the bits of t that fit in an unsigned int (presumably 32).
 * Evaluates to 31 when none of those bits is set.
 *
 * Written as a GNU statement expression, so it needs GCC or Clang.
 * t is evaluated exactly once, so arguments with side effects are safe.
 */
#define FFS(t)  \
({ \
    const unsigned ffs_bits_ = (unsigned) (t); \
    int ffs_n_ = 0; \
    \
    if (!(ffs_bits_ & 0xffffu)) \
        ffs_n_ += 16; \
    \
    if (!(ffs_bits_ & (0xffu << ffs_n_))) \
        ffs_n_ += 8; \
    \
    if (!(ffs_bits_ & (0xfu << ffs_n_))) \
        ffs_n_ += 4; \
    \
    if (!(ffs_bits_ & (0x3u << ffs_n_))) \
        ffs_n_ += 2; \
    \
    if (!(ffs_bits_ & (0x1u << ffs_n_))) \
        ffs_n_ += 1; \
    \
    ffs_n_; \
})

/*
 * FFS(t): zero-based index of the lowest set bit of t, found by binary
 * search over the bits of t that fit in an unsigned int (presumably 32).
 * Evaluates to 31 when none of those bits is set.
 *
 * Written as a GNU statement expression, so it needs GCC or Clang.
 * t is evaluated exactly once, so arguments with side effects are safe.
 */
#define FFS(t)  \
({ \
    const unsigned ffs_bits_ = (unsigned) (t); \
    int ffs_n_ = 0; \
    \
    if (!(ffs_bits_ & 0xffffu)) \
        ffs_n_ += 16; \
    \
    if (!(ffs_bits_ & (0xffu << ffs_n_))) \
        ffs_n_ += 8; \
    \
    if (!(ffs_bits_ & (0xfu << ffs_n_))) \
        ffs_n_ += 4; \
    \
    if (!(ffs_bits_ & (0x3u << ffs_n_))) \
        ffs_n_ += 2; \
    \
    if (!(ffs_bits_ & (0x1u << ffs_n_))) \
        ffs_n_ += 1; \
    \
    ffs_n_; \
})

回复收藏 0 原文

~没有更多了~

关于作者

暂无简介

0 文章

0 评论

24 人气

关注发私信

相关话题

热门标签

操作系统程序设计 IT运维 Linux系统管理 JavaScript 服务器应用 solaris C/C++ PHP Shell BSD Vue.js aix Oracle Python HTML 系统管理 HTML5 CSS 前端

推荐作者

游缘惊梦

文章 0 评论 0

小兔几

文章 0 评论 0

Glik

文章 0 评论 0

生生漫

文章 0 评论 0

Luxian

文章 0 评论 0

Champion-Ming

文章 0 评论 0

友情链接

我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的隐私政策了解更多相关信息。单击 接受 或继续使用网站，即表示您同意使用 Cookies 和您的相关数据。

原文