快速24位数组-> 32位数组转换？

发布于 2024-09-04 15:13:01 字数 558 浏览 10 评论 0原文

快速摘要：

我有一个 24 位值的数组。关于如何快速将各个 24 位数组元素扩展为 32 位元素，有什么建议吗？

详细信息：

我正在使用 DirectX 10 中的像素着色器实时处理传入的视频帧。一个绊脚石是我的帧来自 24 位捕获硬件像素（YUV 或 RGB 图像），但 DX10 采用 32 位像素纹理。因此，我必须将 24 位值扩展为 32 位，然后才能将它们加载到 GPU 中。

我真的不在乎我将剩余的 8 位设置为什么，或者传入的 24 位在该 32 位值中的位置 - 我可以在像素着色器中修复所有这些问题。但我需要非常快速地从 24 位转换为 32 位。

我对 SIMD SSE 操作不是很熟悉，但从我粗略的看去，我似乎无法使用它们进行扩展，因为我的读取和写入大小不同。有什么建议吗？或者我是否卡住了按顺序处理该数据集？

这感觉非常愚蠢 - 我正在使用像素着色器来实现并行性，但在此之前我必须执行顺序的每像素操作。我一定遗漏了一些明显的东西......

原文

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

〗斷ホ乔殘χμё〖 2024-09-11 15:13:01

下面的代码应该相当快。它在每次迭代中复制 4 个像素，仅使用 32 位读/写指令。源指针和目标指针应与 32 位对齐。

uint32_t *src = ...;
uint32_t *dst = ...;

for (int i=0; i<num_pixels; i+=4) {
    uint32_t sa = src[0];
    uint32_t sb = src[1];
    uint32_t sc = src[2];

    dst[i+0] = sa;
    dst[i+1] = (sa>>24) | (sb<<8);
    dst[i+2] = (sb>>16) | (sc<<16);
    dst[i+3] = sc>>8;

    src += 3;
}

编辑：

以下是使用 SSSE3 指令 PSHUFB 和 PALIGNR 执行此操作的方法。该代码是使用编译器内部函数编写的，但如果需要的话，将其转换为汇编应该不难。它在每次迭代中复制 16 个像素。源和目标指针必须对齐到16字节，否则会出错。如果它们未对齐，您可以通过将 _mm_load_si128 替换为 _mm_loadu_si128 并将 _mm_store_si128 替换为 _mm_storeu_si128 来使其工作，但这会比较慢。

#include <emmintrin.h>
#include <tmmintrin.h>

__m128i *src = ...;
__m128i *dst = ...;
__m128i mask = _mm_setr_epi8(0,1,2,-1, 3,4,5,-1, 6,7,8,-1, 9,10,11,-1);

for (int i=0; i<num_pixels; i+=16) {
    __m128i sa = _mm_load_si128(src);
    __m128i sb = _mm_load_si128(src+1);
    __m128i sc = _mm_load_si128(src+2);

    __m128i val = _mm_shuffle_epi8(sa, mask);
    _mm_store_si128(dst, val);
    val = _mm_shuffle_epi8(_mm_alignr_epi8(sb, sa, 12), mask);
    _mm_store_si128(dst+1, val);
    val = _mm_shuffle_epi8(_mm_alignr_epi8(sc, sb, 8), mask);
    _mm_store_si128(dst+2, val);
    val = _mm_shuffle_epi8(_mm_alignr_epi8(sc, sc, 4), mask);
    _mm_store_si128(dst+3, val);

    src += 3;
    dst += 4;
}

SSSE3（不要与 SSE3 混淆）需要相对较新的处理器：Core 2 或更新版本，我相信 AMD 还不支持它。仅使用 SSE2 指令执行此操作将需要更多操作，并且可能不值得。

The code below should be pretty fast. It copies 4 pixels in each iteration, using only 32-bit read/write instructions. The source and destination pointers should be aligned to 32 bits.

uint32_t *src = ...;
uint32_t *dst = ...;

for (int i=0; i<num_pixels; i+=4) {
    uint32_t sa = src[0];
    uint32_t sb = src[1];
    uint32_t sc = src[2];

    dst[i+0] = sa;
    dst[i+1] = (sa>>24) | (sb<<8);
    dst[i+2] = (sb>>16) | (sc<<16);
    dst[i+3] = sc>>8;

    src += 3;
}

Edit:

Here is a way to do this using the SSSE3 instructions PSHUFB and PALIGNR. The code is written using compiler intrinsics, but it shouldn't be hard to translate to assembly if needed. It copies 16 pixels in each iteration. The source and destination pointers Must be aligned to 16 bytes, or it will fault. If they aren't aligned, you can make it work by replacing _mm_load_si128 with _mm_loadu_si128 and _mm_store_si128 with _mm_storeu_si128, but this will be slower.

#include <emmintrin.h>
#include <tmmintrin.h>

__m128i *src = ...;
__m128i *dst = ...;
__m128i mask = _mm_setr_epi8(0,1,2,-1, 3,4,5,-1, 6,7,8,-1, 9,10,11,-1);

for (int i=0; i<num_pixels; i+=16) {
    __m128i sa = _mm_load_si128(src);
    __m128i sb = _mm_load_si128(src+1);
    __m128i sc = _mm_load_si128(src+2);

    __m128i val = _mm_shuffle_epi8(sa, mask);
    _mm_store_si128(dst, val);
    val = _mm_shuffle_epi8(_mm_alignr_epi8(sb, sa, 12), mask);
    _mm_store_si128(dst+1, val);
    val = _mm_shuffle_epi8(_mm_alignr_epi8(sc, sb, 8), mask);
    _mm_store_si128(dst+2, val);
    val = _mm_shuffle_epi8(_mm_alignr_epi8(sc, sc, 4), mask);
    _mm_store_si128(dst+3, val);

    src += 3;
    dst += 4;
}

SSSE3 (not to be confused with SSE3) will require a relatively new processor: Core 2 or newer, and I believe AMD doesn't support it yet. Performing this with SSE2 instructions only will take a lot more operations, and may not be worth it.

回复收藏 0 原文

千鲤 2024-09-11 15:13:01

SSE3 很棒，但对于那些因某种原因无法使用它的人来说，这里是 x86 汇编器中的转换，真正由您手工优化。为了完整起见，我给出了两个方向的转换：RGB32->RGB24 和 RGB24->RGB32。

请注意，interjay 的 C 代码在目标像素的 MSB（Alpha 通道）中留下了垃圾。这在某些应用程序中可能并不重要，但在我的应用程序中很重要，因此我的 RGB24->RGB32 代码强制 MSB 为零。同样，我的 RGB32->RGB24 代码忽略了 MSB；如果源数据具有非零 Alpha 通道，这可以避免垃圾输出。正如基准测试所验证的，这些功能在性能方面几乎没有任何成本。

对于 RGB32->RGB24，我能够击败 VC++ 优化器约 20%。对于 RGB24->RGB32，增益微不足道。基准测试是在 i5 2500K 上进行的。我在这里省略了基准测试代码，但如果有人需要，我会提供它。最重要的优化是尽快碰撞源指针（请参阅 ASAP 评论）。我最好的猜测是，这通过允许指令管道更快地预取来增加并行性。除此之外，我只是重新排序了一些指令，以减少依赖性并将内存访问与位攻击重叠。

void ConvRGB32ToRGB24(const UINT *Src, UINT *Dst, UINT Pixels)
{
#if !USE_ASM
    for (UINT i = 0; i < Pixels; i += 4) {
        UINT    sa = Src[i + 0] & 0xffffff;
        UINT    sb = Src[i + 1] & 0xffffff;
        UINT    sc = Src[i + 2] & 0xffffff;
        UINT    sd = Src[i + 3];
        Dst[0] = sa | (sb << 24);
        Dst[1] = (sb >> 8) | (sc << 16);
        Dst[2] = (sc >> 16) | (sd << 8);
        Dst += 3;
    }
#else
    __asm {
        mov     ecx, Pixels
        shr     ecx, 2              // 4 pixels at once
        jz      ConvRGB32ToRGB24_$2
        mov     esi, Src
        mov     edi, Dst
ConvRGB32ToRGB24_$1:
        mov     ebx, [esi + 4]      // sb
        and     ebx, 0ffffffh       // sb & 0xffffff
        mov     eax, [esi + 0]      // sa
        and     eax, 0ffffffh       // sa & 0xffffff
        mov     edx, ebx            // copy sb
        shl     ebx, 24             // sb << 24
        or      eax, ebx            // sa | (sb << 24)
        mov     [edi + 0], eax      // Dst[0]
        shr     edx, 8              // sb >> 8
        mov     eax, [esi + 8]      // sc
        and     eax, 0ffffffh       // sc & 0xffffff
        mov     ebx, eax            // copy sc
        shl     eax, 16             // sc << 16
        or      eax, edx            // (sb >> 8) | (sc << 16)
        mov     [edi + 4], eax      // Dst[1]
        shr     ebx, 16             // sc >> 16
        mov     eax, [esi + 12]     // sd
        add     esi, 16             // Src += 4 (ASAP)
        shl     eax, 8              // sd << 8
        or      eax, ebx            // (sc >> 16) | (sd << 8)
        mov     [edi + 8], eax      // Dst[2]
        add     edi, 12             // Dst += 3
        dec     ecx
        jnz     SHORT ConvRGB32ToRGB24_$1
ConvRGB32ToRGB24_$2:
    }
#endif
}

void ConvRGB24ToRGB32(const UINT *Src, UINT *Dst, UINT Pixels)
{
#if !USE_ASM
    for (UINT i = 0; i < Pixels; i += 4) {
        UINT    sa = Src[0];
        UINT    sb = Src[1];
        UINT    sc = Src[2];
        Dst[i + 0] = sa & 0xffffff;
        Dst[i + 1] = ((sa >> 24) | (sb << 8)) & 0xffffff;
        Dst[i + 2] = ((sb >> 16) | (sc << 16)) & 0xffffff;
        Dst[i + 3] = sc >> 8;
        Src += 3;
    }
#else
    __asm {
        mov     ecx, Pixels
        shr     ecx, 2              // 4 pixels at once
        jz      SHORT ConvRGB24ToRGB32_$2
        mov     esi, Src
        mov     edi, Dst
        push    ebp
ConvRGB24ToRGB32_$1:
        mov     ebx, [esi + 4]      // sb
        mov     edx, ebx            // copy sb
        mov     eax, [esi + 0]      // sa
        mov     ebp, eax            // copy sa
        and     ebx, 0ffffh         // sb & 0xffff
        shl     ebx, 8              // (sb & 0xffff) << 8
        and     eax, 0ffffffh       // sa & 0xffffff
        mov     [edi + 0], eax      // Dst[0]
        shr     ebp, 24             // sa >> 24
        or      ebx, ebp            // (sa >> 24) | ((sb & 0xffff) << 8)
        mov     [edi + 4], ebx      // Dst[1]
        shr     edx, 16             // sb >> 16
        mov     eax, [esi + 8]      // sc
        add     esi, 12             // Src += 12 (ASAP)
        mov     ebx, eax            // copy sc
        and     eax, 0ffh           // sc & 0xff
        shl     eax, 16             // (sc & 0xff) << 16
        or      eax, edx            // (sb >> 16) | ((sc & 0xff) << 16)
        mov     [edi + 8], eax      // Dst[2]
        shr     ebx, 8              // sc >> 8
        mov     [edi + 12], ebx     // Dst[3]
        add     edi, 16             // Dst += 16
        dec     ecx
        jnz     SHORT ConvRGB24ToRGB32_$1
        pop     ebp
ConvRGB24ToRGB32_$2:
    }
#endif
}

当我们这样做时，这里是实际 SSE3 汇编中的相同转换。仅当您有汇编器（FASM 是免费的）并且有支持 SSE3 的 CPU（可能但最好检查一下）时，这才有效。请注意，内在函数不一定会输出如此有效的结果，它完全取决于您使用的工具以及您要编译的平台。在这里，很简单：所见即所得。此代码生成与上面的 x86 代码相同的输出，并且速度大约快 1.5 倍（在 i5 2500K 上）。

format MS COFF

section '.text' code readable executable

public _ConvRGB32ToRGB24SSE3

;   ebp + 8     Src (*RGB32, 16-byte aligned)
;   ebp + 12    Dst (*RGB24, 16-byte aligned)
;   ebp + 16    Pixels

_ConvRGB32ToRGB24SSE3:
    push    ebp
    mov     ebp, esp
    mov     eax, [ebp + 8]
    mov     edx, [ebp + 12]
    mov     ecx, [ebp + 16]
    shr     ecx, 4
    jz      done1
    movupd  xmm7, [mask1]

top1:
    movupd  xmm0, [eax + 0]     ; sa = Src[0]
    pshufb  xmm0, xmm7          ; sa = _mm_shuffle_epi8(sa, mask)
    movupd  xmm1, [eax + 16]    ; sb = Src[1]
    pshufb  xmm1, xmm7          ; sb = _mm_shuffle_epi8(sb, mask)
    movupd  xmm2, xmm1          ; sb1 = sb
    pslldq  xmm1, 12            ; sb = _mm_slli_si128(sb, 12)
    por     xmm0, xmm1          ; sa = _mm_or_si128(sa, sb)
    movupd  [edx + 0], xmm0     ; Dst[0] = sa
    psrldq  xmm2, 4             ; sb1 = _mm_srli_si128(sb1, 4)
    movupd  xmm0, [eax + 32]    ; sc = Src[2]
    pshufb  xmm0, xmm7          ; sc = _mm_shuffle_epi8(sc, mask)
    movupd  xmm1, xmm0          ; sc1 = sc
    pslldq  xmm0, 8             ; sc = _mm_slli_si128(sc, 8)
    por     xmm0, xmm2          ; sc = _mm_or_si128(sb1, sc)
    movupd  [edx + 16], xmm0    ; Dst[1] = sc
    psrldq  xmm1, 8             ; sc1 = _mm_srli_si128(sc1, 8)
    movupd  xmm0, [eax + 48]    ; sd = Src[3]
    pshufb  xmm0, xmm7          ; sd = _mm_shuffle_epi8(sd, mask)
    pslldq  xmm0, 4             ; sd = _mm_slli_si128(sd, 4)
    por     xmm0, xmm1          ; sd = _mm_or_si128(sc1, sd)
    movupd  [edx + 32], xmm0    ; Dst[2] = sd
    add     eax, 64
    add     edx, 48
    dec     ecx
    jnz     top1

done1:
    pop     ebp
    ret

public _ConvRGB24ToRGB32SSE3

;   ebp + 8     Src (*RGB24, 16-byte aligned)
;   ebp + 12    Dst (*RGB32, 16-byte aligned)
;   ebp + 16    Pixels

_ConvRGB24ToRGB32SSE3:
    push    ebp
    mov     ebp, esp
    mov     eax, [ebp + 8]
    mov     edx, [ebp + 12]
    mov     ecx, [ebp + 16]
    shr     ecx, 4
    jz      done2
    movupd  xmm7, [mask2]

top2:
    movupd  xmm0, [eax + 0]     ; sa = Src[0]
    movupd  xmm1, [eax + 16]    ; sb = Src[1]
    movupd  xmm2, [eax + 32]    ; sc = Src[2]
    movupd  xmm3, xmm0          ; sa1 = sa
    pshufb  xmm0, xmm7          ; sa = _mm_shuffle_epi8(sa, mask)
    movupd  [edx], xmm0         ; Dst[0] = sa
    movupd  xmm4, xmm1          ; sb1 = sb
    palignr xmm1, xmm3, 12      ; sb = _mm_alignr_epi8(sb, sa1, 12)
    pshufb  xmm1, xmm7          ; sb = _mm_shuffle_epi8(sb, mask);
    movupd  [edx + 16], xmm1    ; Dst[1] = sb
    movupd  xmm3, xmm2          ; sc1 = sc
    palignr xmm2, xmm4, 8       ; sc = _mm_alignr_epi8(sc, sb1, 8)
    pshufb  xmm2, xmm7          ; sc = _mm_shuffle_epi8(sc, mask)
    movupd  [edx + 32], xmm2    ; Dst[2] = sc
    palignr xmm3, xmm3, 4       ; sc1 = _mm_alignr_epi8(sc1, sc1, 4)
    pshufb  xmm3, xmm7          ; sc1 = _mm_shuffle_epi8(sc1, mask)
    movupd  [edx + 48], xmm3    ; Dst[3] = sc1
    add     eax, 48
    add     edx, 64
    dec     ecx
    jnz     top2

done2:
    pop     ebp
    ret

section '.data' data readable writeable align 16

label mask1 dqword 
    db  0,1,2,4, 5,6,8,9, 10,12,13,14, -1,-1,-1,-1
label mask2 dqword 
    db  0,1,2,-1, 3,4,5,-1, 6,7,8,-1, 9,10,11,-1

SSE3 is awesome, but for those who can't use it for whatever reason, here's the conversion in x86 assembler, hand-optimized by yours truly. For completeness, I give the conversion in both directions: RGB32->RGB24 and RGB24->RGB32.

Note that interjay's C code leaves trash in the MSB (the alpha channel) of the destination pixels. This might not matter in some applications, but it matters in mine, hence my RGB24->RGB32 code forces the MSB to zero. Similarly, my RGB32->RGB24 code ignores the MSB; this avoids garbage output if the source data has a non-zero alpha channel. These features cost almost nothing in terms of performance, as verified by benchmarks.

For RGB32->RGB24 I was able to beat the VC++ optimizer by about 20%. For RGB24->RGB32 the gain was insignificant. Benchmarking was done on an i5 2500K. I omit the benchmarking code here, but if anyone wants it I'll provide it. The most important optimization was bumping the source pointer as soon as possible (see the ASAP comment). My best guess is that this increases parallelism by allowing the instruction pipeline to prefetch sooner. Other than that I just reordered some instructions to reduce dependencies and overlap memory accesses with bit-bashing.

void ConvRGB32ToRGB24(const UINT *Src, UINT *Dst, UINT Pixels)
{
#if !USE_ASM
    for (UINT i = 0; i < Pixels; i += 4) {
        UINT    sa = Src[i + 0] & 0xffffff;
        UINT    sb = Src[i + 1] & 0xffffff;
        UINT    sc = Src[i + 2] & 0xffffff;
        UINT    sd = Src[i + 3];
        Dst[0] = sa | (sb << 24);
        Dst[1] = (sb >> 8) | (sc << 16);
        Dst[2] = (sc >> 16) | (sd << 8);
        Dst += 3;
    }
#else
    __asm {
        mov     ecx, Pixels
        shr     ecx, 2              // 4 pixels at once
        jz      ConvRGB32ToRGB24_$2
        mov     esi, Src
        mov     edi, Dst
ConvRGB32ToRGB24_$1:
        mov     ebx, [esi + 4]      // sb
        and     ebx, 0ffffffh       // sb & 0xffffff
        mov     eax, [esi + 0]      // sa
        and     eax, 0ffffffh       // sa & 0xffffff
        mov     edx, ebx            // copy sb
        shl     ebx, 24             // sb << 24
        or      eax, ebx            // sa | (sb << 24)
        mov     [edi + 0], eax      // Dst[0]
        shr     edx, 8              // sb >> 8
        mov     eax, [esi + 8]      // sc
        and     eax, 0ffffffh       // sc & 0xffffff
        mov     ebx, eax            // copy sc
        shl     eax, 16             // sc << 16
        or      eax, edx            // (sb >> 8) | (sc << 16)
        mov     [edi + 4], eax      // Dst[1]
        shr     ebx, 16             // sc >> 16
        mov     eax, [esi + 12]     // sd
        add     esi, 16             // Src += 4 (ASAP)
        shl     eax, 8              // sd << 8
        or      eax, ebx            // (sc >> 16) | (sd << 8)
        mov     [edi + 8], eax      // Dst[2]
        add     edi, 12             // Dst += 3
        dec     ecx
        jnz     SHORT ConvRGB32ToRGB24_$1
ConvRGB32ToRGB24_$2:
    }
#endif
}

void ConvRGB24ToRGB32(const UINT *Src, UINT *Dst, UINT Pixels)
{
#if !USE_ASM
    for (UINT i = 0; i < Pixels; i += 4) {
        UINT    sa = Src[0];
        UINT    sb = Src[1];
        UINT    sc = Src[2];
        Dst[i + 0] = sa & 0xffffff;
        Dst[i + 1] = ((sa >> 24) | (sb << 8)) & 0xffffff;
        Dst[i + 2] = ((sb >> 16) | (sc << 16)) & 0xffffff;
        Dst[i + 3] = sc >> 8;
        Src += 3;
    }
#else
    __asm {
        mov     ecx, Pixels
        shr     ecx, 2              // 4 pixels at once
        jz      SHORT ConvRGB24ToRGB32_$2
        mov     esi, Src
        mov     edi, Dst
        push    ebp
ConvRGB24ToRGB32_$1:
        mov     ebx, [esi + 4]      // sb
        mov     edx, ebx            // copy sb
        mov     eax, [esi + 0]      // sa
        mov     ebp, eax            // copy sa
        and     ebx, 0ffffh         // sb & 0xffff
        shl     ebx, 8              // (sb & 0xffff) << 8
        and     eax, 0ffffffh       // sa & 0xffffff
        mov     [edi + 0], eax      // Dst[0]
        shr     ebp, 24             // sa >> 24
        or      ebx, ebp            // (sa >> 24) | ((sb & 0xffff) << 8)
        mov     [edi + 4], ebx      // Dst[1]
        shr     edx, 16             // sb >> 16
        mov     eax, [esi + 8]      // sc
        add     esi, 12             // Src += 12 (ASAP)
        mov     ebx, eax            // copy sc
        and     eax, 0ffh           // sc & 0xff
        shl     eax, 16             // (sc & 0xff) << 16
        or      eax, edx            // (sb >> 16) | ((sc & 0xff) << 16)
        mov     [edi + 8], eax      // Dst[2]
        shr     ebx, 8              // sc >> 8
        mov     [edi + 12], ebx     // Dst[3]
        add     edi, 16             // Dst += 16
        dec     ecx
        jnz     SHORT ConvRGB24ToRGB32_$1
        pop     ebp
ConvRGB24ToRGB32_$2:
    }
#endif
}

And while we're at it, here are the same conversions in actual SSE3 assembly. This only works if you have an assembler (FASM is free) and have a CPU that supports SSE3 (likely but it's better to check). Note that the intrinsics don't necessarily output something this efficient, it totally depends on the tools you use and what platform you're compiling for. Here, it's straightforward: what you see is what you get. This code generates the same output as the x86 code above, and it's about 1.5x faster (on an i5 2500K).

format MS COFF

section '.text' code readable executable

public _ConvRGB32ToRGB24SSE3

;   ebp + 8     Src (*RGB32, 16-byte aligned)
;   ebp + 12    Dst (*RGB24, 16-byte aligned)
;   ebp + 16    Pixels

_ConvRGB32ToRGB24SSE3:
    push    ebp
    mov     ebp, esp
    mov     eax, [ebp + 8]
    mov     edx, [ebp + 12]
    mov     ecx, [ebp + 16]
    shr     ecx, 4
    jz      done1
    movupd  xmm7, [mask1]

top1:
    movupd  xmm0, [eax + 0]     ; sa = Src[0]
    pshufb  xmm0, xmm7          ; sa = _mm_shuffle_epi8(sa, mask)
    movupd  xmm1, [eax + 16]    ; sb = Src[1]
    pshufb  xmm1, xmm7          ; sb = _mm_shuffle_epi8(sb, mask)
    movupd  xmm2, xmm1          ; sb1 = sb
    pslldq  xmm1, 12            ; sb = _mm_slli_si128(sb, 12)
    por     xmm0, xmm1          ; sa = _mm_or_si128(sa, sb)
    movupd  [edx + 0], xmm0     ; Dst[0] = sa
    psrldq  xmm2, 4             ; sb1 = _mm_srli_si128(sb1, 4)
    movupd  xmm0, [eax + 32]    ; sc = Src[2]
    pshufb  xmm0, xmm7          ; sc = _mm_shuffle_epi8(sc, mask)
    movupd  xmm1, xmm0          ; sc1 = sc
    pslldq  xmm0, 8             ; sc = _mm_slli_si128(sc, 8)
    por     xmm0, xmm2          ; sc = _mm_or_si128(sb1, sc)
    movupd  [edx + 16], xmm0    ; Dst[1] = sc
    psrldq  xmm1, 8             ; sc1 = _mm_srli_si128(sc1, 8)
    movupd  xmm0, [eax + 48]    ; sd = Src[3]
    pshufb  xmm0, xmm7          ; sd = _mm_shuffle_epi8(sd, mask)
    pslldq  xmm0, 4             ; sd = _mm_slli_si128(sd, 4)
    por     xmm0, xmm1          ; sd = _mm_or_si128(sc1, sd)
    movupd  [edx + 32], xmm0    ; Dst[2] = sd
    add     eax, 64
    add     edx, 48
    dec     ecx
    jnz     top1

done1:
    pop     ebp
    ret

public _ConvRGB24ToRGB32SSE3

;   ebp + 8     Src (*RGB24, 16-byte aligned)
;   ebp + 12    Dst (*RGB32, 16-byte aligned)
;   ebp + 16    Pixels

_ConvRGB24ToRGB32SSE3:
    push    ebp
    mov     ebp, esp
    mov     eax, [ebp + 8]
    mov     edx, [ebp + 12]
    mov     ecx, [ebp + 16]
    shr     ecx, 4
    jz      done2
    movupd  xmm7, [mask2]

top2:
    movupd  xmm0, [eax + 0]     ; sa = Src[0]
    movupd  xmm1, [eax + 16]    ; sb = Src[1]
    movupd  xmm2, [eax + 32]    ; sc = Src[2]
    movupd  xmm3, xmm0          ; sa1 = sa
    pshufb  xmm0, xmm7          ; sa = _mm_shuffle_epi8(sa, mask)
    movupd  [edx], xmm0         ; Dst[0] = sa
    movupd  xmm4, xmm1          ; sb1 = sb
    palignr xmm1, xmm3, 12      ; sb = _mm_alignr_epi8(sb, sa1, 12)
    pshufb  xmm1, xmm7          ; sb = _mm_shuffle_epi8(sb, mask);
    movupd  [edx + 16], xmm1    ; Dst[1] = sb
    movupd  xmm3, xmm2          ; sc1 = sc
    palignr xmm2, xmm4, 8       ; sc = _mm_alignr_epi8(sc, sb1, 8)
    pshufb  xmm2, xmm7          ; sc = _mm_shuffle_epi8(sc, mask)
    movupd  [edx + 32], xmm2    ; Dst[2] = sc
    palignr xmm3, xmm3, 4       ; sc1 = _mm_alignr_epi8(sc1, sc1, 4)
    pshufb  xmm3, xmm7          ; sc1 = _mm_shuffle_epi8(sc1, mask)
    movupd  [edx + 48], xmm3    ; Dst[3] = sc1
    add     eax, 48
    add     edx, 64
    dec     ecx
    jnz     top2

done2:
    pop     ebp
    ret

section '.data' data readable writeable align 16

label mask1 dqword 
    db  0,1,2,4, 5,6,8,9, 10,12,13,14, -1,-1,-1,-1
label mask2 dqword 
    db  0,1,2,-1, 3,4,5,-1, 6,7,8,-1, 9,10,11,-1

回复收藏 0 原文

墨小墨 2024-09-11 15:13:01

不同的输入/输出大小并不是使用 simd 的障碍，只是一个减速带。您需要对数据进行分块，以便以完整的 simd 字（16 字节）进行读写。

在这种情况下，您将读取 3 个 SIMD 字（48 字节 == 16 rgb 像素），进行扩展，然后写入 4 个 SIMD 字。

我只是说您可以使用SIMD，而不是说您应该。中间的部分，即扩展，仍然很棘手，因为单词的不同部分的移位大小不一致。

回复收藏 0 原文

过期以后 2024-09-11 15:13:01

SSE 4.1 .ASM：

PINSRD  XMM0,  DWORD PTR[ESI],   0
PINSRD  XMM0,  DWORD PTR[ESI+3], 1
PINSRD  XMM0,  DWORD PTR[ESI+6], 2
PINSRD  XMM0,  DWORD PTR[ESI+9], 3
PSLLD   XMM0,  8                    
PSRLD   XMM0,  8
MOVNTDQ [EDI], XMM1
add     ESI,   12
add     EDI,   16

SSE 4.1 .ASM:

PINSRD  XMM0,  DWORD PTR[ESI],   0
PINSRD  XMM0,  DWORD PTR[ESI+3], 1
PINSRD  XMM0,  DWORD PTR[ESI+6], 2
PINSRD  XMM0,  DWORD PTR[ESI+9], 3
PSLLD   XMM0,  8                    
PSRLD   XMM0,  8
MOVNTDQ [EDI], XMM1
add     ESI,   12
add     EDI,   16

回复收藏 0 原文

~没有更多了~