How can SIMD be used to speed up the sum of squared differences between the 8-bit components of RGBA pixels?

The code below extracts the red, green, blue, and alpha channels of a pixel value and computes the squared distance to each color in another set of RGBA values. The code appears to be slow around the logic that performs the squaring and addition.

How could it be replaced with a faster version? This logic does not seem to use any SIMD capabilities at all.

typedef struct {
    unsigned char b, g, r, a;
} pixel;

register pixel *pPixel;
register int i, red1, green1, blue1, alpha1;
register int red2, green2, blue2, alpha2;
register long oldD, newD;

red1 = GetRed( *pPixel );
green1 = GetGreen( *pPixel );
blue1 = GetBlue( *pPixel );
alpha1 = GetAlpha( *pPixel );
oldD = 2000000000;
for ( i = 0; i < newcolors; ++i ) {
    red2 = GetRed( mycolormap[i].acolor );
    green2 = GetGreen( mycolormap[i].acolor );
    blue2 = GetBlue( mycolormap[i].acolor );
    alpha2 = GetAlpha( mycolormap[i].acolor );

    newD = ( red1 - red2 ) * ( red1 - red2 ) +
           ( green1 - green2 ) * ( green1 - green2 ) +
           ( blue1 - blue2 ) * ( blue1 - blue2 ) +
           ( alpha1 - alpha2 ) * ( alpha1 - alpha2 );
    if ( newD < oldD ) {
        oldD = newD;
    }
}

The section of code that seems to need improvement is:

newD = ( red1 - red2 ) * ( red1 - red2 ) +
       ( green1 - green2 ) * ( green1 - green2 ) +
       ( blue1 - blue2 ) * ( blue1 - blue2 ) +
       ( alpha1 - alpha2 ) * ( alpha1 - alpha2 );


Answer by 開玄 (2025-02-02 17:37:27)

It's harder than it seems. Unfortunately, the automatic vectorizers in C++ compilers rarely do a good job on integer arithmetic like you have there.

The following implementation only needs SSE4.1. If you have AVX2, it is possible to improve it substantially by upgrading all these vectors to 32-byte ones, but that complicates a couple of things, namely the remainder handling and the final reduction.

I assumed you want not only the minimum squared distance (the dot product of the difference with itself) but also the index of the pixel that produced it. If you only want the minimum distance, remove the bestIndices field and the code which handles that field.

#include <immintrin.h>   // SSE2 / SSE4.1 intrinsics
#include <climits>       // INT_MAX
#include <cstdint>       // uint8_t
#include <cstddef>       // size_t

struct alignas( 4 ) Pixel
{
    uint8_t b, g, r, a;
};

// Define __SSE4_1__ macro when building with MSVC for AVX1 or newer ISA
#if defined( _MSC_VER ) && defined( __AVX__ ) && !defined( __SSE4_1__ )
#define __SSE4_1__ 1
#endif

size_t findClosestPixel( const Pixel& ref, const Pixel* rsi, size_t length, int& bestValue )
{
    if( 0 == length )
    {
        bestValue = INT_MAX;
        return ~(size_t)0;
    }

    class Acc
    {
        // The reference pixel we're after, broadcasted and split into low/high pieces in 16-bit lanes
        __m128i lowRef, highRef;
        // The best dot product so far
        __m128i bestSquares = _mm_set1_epi32( INT_MAX );
        // Index of the pixels currently in bestSquares
        __m128i bestIndices = _mm_set1_epi32( -1 );

        const __m128i lowMask = _mm_set1_epi16( 0xFF );

        // For lanes where dp < bestSquares, update bestSquares and bestIndices vectors
        void updateFields( __m128i dp, __m128i indices )
        {
            const __m128i lt = _mm_cmplt_epi32( dp, bestSquares );
#ifndef __SSE4_1__
            bestSquares = _mm_or_si128( _mm_and_si128( lt, dp ), _mm_andnot_si128( lt, bestSquares ) );
            bestIndices = _mm_or_si128( _mm_and_si128( lt, indices ), _mm_andnot_si128( lt, bestIndices ) );
#else
            bestSquares = _mm_min_epi32( dp, bestSquares );
            bestIndices = _mm_blendv_epi8( bestIndices, indices, lt );
#endif
        }

    public:

        Acc( const Pixel& ref )
        {
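            // Type-puns the 4-byte Pixel as an int; copying it with std::memcpy into an int
            // first would avoid strict-aliasing concerns.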
            __m128i tmp = _mm_set1_epi32( *(const int*)( &ref ) );
            lowRef = _mm_and_si128( tmp, lowMask );
            highRef = _mm_srli_epi16( tmp, 8 );
        }

        // Update the accumulator with another 4 pixels
        void update( __m128i pixels, __m128i indices )
        {
            // Split into two vectors with 16-bit lanes:
            // low contains blue and red channels, high contains green and alpha
            __m128i low = _mm_and_si128( pixels, lowMask );
            __m128i high = _mm_srli_epi16( pixels, 8 );

            // Compute difference with the reference value we're after
            low = _mm_sub_epi16( low, lowRef );
            high = _mm_sub_epi16( high, highRef );

            // Compute squares as 32-bit numbers, add adjacent pairs
            low = _mm_madd_epi16( low, low );
            high = _mm_madd_epi16( high, high );
            // Adding them results in the dot product (sum of squares) for all 4 channels
            __m128i dp = _mm_add_epi32( low, high );

            // Update the state
            updateFields( dp, indices );
        }

        // Compute horizontal minimum across lanes in these accumulators
        uint32_t reduce( int& bestDp )
        {
            // Swap low/high halves
            __m128i s2 = _mm_shuffle_epi32( bestSquares, _MM_SHUFFLE( 1, 0, 3, 2 ) );
            __m128i i2 = _mm_shuffle_epi32( bestIndices, _MM_SHUFFLE( 1, 0, 3, 2 ) );
            updateFields( s2, i2 );

            // Swap even/odd lanes
            s2 = _mm_shuffle_epi32( bestSquares, _MM_SHUFFLE( 2, 3, 0, 1 ) );
            i2 = _mm_shuffle_epi32( bestIndices, _MM_SHUFFLE( 2, 3, 0, 1 ) );
            updateFields( s2, i2 );

            // Return lowest lanes from both vectors
            bestDp = _mm_cvtsi128_si32( bestSquares );
            return (uint32_t)_mm_cvtsi128_si32( bestIndices );
        }
    };

    Acc impl{ ref };

    const size_t lengthAligned = ( length / 4 ) * 4;
    size_t i;
    __m128i currentIndices = _mm_setr_epi32( 0, 1, 2, 3 );
    for( i = 0; i < lengthAligned; i += 4 )
    {
        // Load 4 source pixels
        __m128i src = _mm_loadu_si128( ( const __m128i* )( rsi + i ) );
        // Update things
        impl.update( src, currentIndices );
        // Increment index vector by 4 pixels
        currentIndices = _mm_add_epi32( currentIndices, _mm_set1_epi32( 4 ) );
    }

    const size_t remainder = length % 4;
    if( remainder == 0 )
    {
        // The input was a multiple of 4 pixels
        return impl.reduce( bestValue );
    }

    const int* const pi = (const int*)( rsi + i );
    __m128i rv;
    if( lengthAligned > 0 )
    {
        // We had at least 4 elements on input, can do unaligned load with negative offset
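        // A few pixels near the end get processed twice, which is harmless when taking a minimum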
        size_t offset = 4 - remainder;
        currentIndices = _mm_sub_epi32( currentIndices, _mm_set1_epi32( (int)offset ) );
        rv = _mm_loadu_si128( ( const __m128i* )( pi - offset ) );
    }
    else
    {
        // Fewer than 4 elements on input: do a partial load and broadcast the last element
        switch( remainder )
        {
        case 1:
            rv = _mm_set1_epi32( pi[ 0 ] );
            break;
        case 2:
            rv = _mm_loadl_epi64( ( const __m128i* )pi );
            // Broadcast pixel 1 into lanes 1..3, and make the index vector match,
            // otherwise the reduction could report an out-of-range index of 2 or 3
            rv = _mm_shuffle_epi32( rv, _MM_SHUFFLE( 1, 1, 1, 0 ) );
            currentIndices = _mm_setr_epi32( 0, 1, 1, 1 );
            break;
        case 3:
            rv = _mm_loadl_epi64( ( const __m128i* )pi );
#ifndef __SSE4_1__
            rv = _mm_unpacklo_epi64( rv, _mm_set1_epi32( pi[ 2 ] ) );
#else
            rv = _mm_insert_epi32( rv, pi[ 2 ], 2 );
            rv = _mm_shuffle_epi32( rv, _MM_SHUFFLE( 2, 2, 1, 0 ) );
#endif
            break;
        }
    }

    impl.update( rv, currentIndices );
    return impl.reduce( bestValue );
}
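
For reference, here is a minimal usage sketch (my addition, not part of the original answer) showing how findClosestPixel could stand in for the question's loop over mycolormap. The palette values and the main function are made up for illustration; only Pixel and findClosestPixel come from the code above, and this would live in the same translation unit as that code.

#include <cstdio>
#include <vector>

int main()
{
    // Hypothetical palette; in the question this role is played by mycolormap[i].acolor
    std::vector<Pixel> palette = {
        { 10, 20, 30, 255 }, { 200, 100, 50, 255 }, { 12, 22, 33, 255 }
    };
    // The pixel being matched; in the question this is *pPixel
    Pixel target{ 11, 21, 31, 255 };

    int bestDistance = 0;
    size_t bestIndex = findClosestPixel( target, palette.data(), palette.size(), bestDistance );

    // bestDistance plays the role of oldD in the original loop; bestIndex identifies the closest entry
    std::printf( "closest entry: %zu, squared distance: %d\n", bestIndex, bestDistance );
    return 0;
}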