使用 NEON 优化 RGBA8888 到 RGB565 的转换

发布于 2024-12-09 08:42:48 字数 3524 浏览 3 评论 0原文

我正在尝试使用 NEON 矢量指令集优化 iOS 上的图像格式转换。我认为这可以很好地映射到这一点，因为它处理一堆类似的数据。

不过，我的尝试并没有那么顺利，与天真的 c 实现相比，仅实现了边际加速：

for(int i = 0; i < pixelCount; ++i, ++inPixel32) {
    const unsigned int r = ((*inPixel32 >> 0 ) & 0xFF);
    const unsigned int g = ((*inPixel32 >> 8 ) & 0xFF);
    const unsigned int b = ((*inPixel32 >> 16) & 0xFF);
    *outPixel16++ = ((r >> 3) << 11) | ((g >> 2) << 5) | ((b >> 3) << 0);
}

iPad 2 上的 1 兆像素图像阵列：

格式为 [min avg max n=计时器样本数]（以毫秒为单位）

C： [14.446 14.632 18.405 n=1000]ms

氖： [11.920 12.032 15.336 n=1000]ms

我对 NEON 实现的尝试如下：

    int i;
const int pixelsPerLoop = 8;
for(i = 0; i < pixelCount; i += pixelsPerLoop, inPixel32 += pixelsPerLoop, outPixel16 += pixelsPerLoop) {
    //Read all r,g,b pixels into 3 registers
    uint8x8x4_t rgba  = vld4_u8(inPixel32);
    //Right-shift r,g,b as appropriate
    uint8x8_t r = vshr_n_u8(rgba.val[0], 3);
    uint8x8_t g = vshr_n_u8(rgba.val[1], 2);
    uint8x8_t b = vshr_n_u8(rgba.val[2], 3);

    //Widen b
    uint16x8_t r5_g6_b5 = vmovl_u8(b);
    //Widen r
    uint16x8_t r16 = vmovl_u8(r);
    //Left shift into position within 16-bit int
    r16 = vshlq_n_u16(r16, 11);
    r5_g6_b5 |= r16;

    //Widen g
    uint16x8_t g16 = vmovl_u8(g);
    //Left shift into position within 16-bit int
    g16 = vshlq_n_u16(g16, 5);

    r5_g6_b5 |= g16;

    //Now write back to memory
    vst1q_u16(outPixel16, r5_g6_b5);        
}
//Do the remainder on normal flt hardware

代码通过 LLVM 3.0 编译为以下内容（.loc 和额外标签已删除）：

_DNConvert_ARGB8888toRGB565:
    push    {r4, r5, r7, lr}
    mov r9, r1
    mov.w   r12, #0
    add r7, sp, #8
    cmp r2, #0
    mov.w   r1, #0
    it  ne
    movne   r1, #1
    cmp r0, #0
    mov.w   r3, #0
    it  ne
    movne   r3, #1
    cmp.w   r9, #0
    mov.w   r4, #0
    it  ne
    movne   r4, #1
    tst.w   r9, #3
    bne LBB0_8
    ands    r1, r3
    ands    r1, r4
    cmp r1, #1
    bne LBB0_8
    movs    r1, #0
    lsr.w   lr, r9, #2
    cmp.w   r1, r9, lsr #2
    bne LBB0_9
    mov r3, r2
    mov r5, r0
    b   LBB0_5
LBB0_4:
    movw    r1, #65528
    add.w   r0, lr, #7
    movt    r1, #32767
    ands    r1, r0
LBB0_5:
    mov.w   r12, #1
    cmp r1, lr
    bhs LBB0_8
    rsb r0, r1, r9, lsr #2
    mov.w   r9, #63488
    mov.w   lr, #2016
    mov.w   r12, #1
LBB0_7:
    ldr r2, [r5], #4
    subs    r0, #1
    and.w   r1, r9, r2, lsl #8
    and.w   r4, lr, r2, lsr #5
    ubfx    r2, r2, #19, #5
    orr.w   r2, r2, r4
    orr.w   r1, r1, r2
    strh    r1, [r3], #2
    bne LBB0_7
LBB0_8:
    mov r0, r12
    pop {r4, r5, r7, pc}
LBB0_9:
    sub.w   r1, lr, #1
    movs    r3, #32
    add.w   r3, r3, r1, lsl #2
    bic r3, r3, #31
    adds    r5, r0, r3
    movs    r3, #16
    add.w   r1, r3, r1, lsl #1
    bic r1, r1, #15
    adds    r3, r2, r1
    movs    r1, #0
LBB0_10:
    vld4.8  {d16, d17, d18, d19}, [r0]!
    adds    r1, #8
    cmp r1, lr
    vshr.u8 d20, d16, #3
    vshr.u8 d21, d17, #2
    vshr.u8 d16, d18, #3
    vmovl.u8    q11, d20
    vmovl.u8    q9, d21
    vmovl.u8    q8, d16
    vshl.i16    q10, q11, #11
    vshl.i16    q9, q9, #5
    vorr    q8, q8, q10
    vorr    q8, q8, q9
    vst1.16 {d16, d17}, [r2]!
Ltmp28:
    blo LBB0_10
    b   LBB0_4

完整代码可在 https://github.com/darknoon/DNImageConvert 我将不胜感激任何帮助，谢谢！

原文

I'm trying to optimize an image format conversion on iOS using the NEON vector instruction set. I assumed this would map well to that because it processes a bunch of similar data.

My attempts haven't gone that well, though, achieving only a marginal speedup vs the naive c implementation:

for(int i = 0; i < pixelCount; ++i, ++inPixel32) {
    const unsigned int r = ((*inPixel32 >> 0 ) & 0xFF);
    const unsigned int g = ((*inPixel32 >> 8 ) & 0xFF);
    const unsigned int b = ((*inPixel32 >> 16) & 0xFF);
    *outPixel16++ = ((r >> 3) << 11) | ((g >> 2) << 5) | ((b >> 3) << 0);
}

1 megapixel image array on iPad 2:

format is [min avg max n=number of timer samples] in milliseconds

C:
[14.446 14.632 18.405 n=1000]ms

NEON:
[11.920 12.032 15.336 n=1000]ms

My attempt at a NEON implementation is below:

    int i;
const int pixelsPerLoop = 8;
for(i = 0; i < pixelCount; i += pixelsPerLoop, inPixel32 += pixelsPerLoop, outPixel16 += pixelsPerLoop) {
    //Read all r,g,b pixels into 3 registers
    uint8x8x4_t rgba  = vld4_u8(inPixel32);
    //Right-shift r,g,b as appropriate
    uint8x8_t r = vshr_n_u8(rgba.val[0], 3);
    uint8x8_t g = vshr_n_u8(rgba.val[1], 2);
    uint8x8_t b = vshr_n_u8(rgba.val[2], 3);

    //Widen b
    uint16x8_t r5_g6_b5 = vmovl_u8(b);
    //Widen r
    uint16x8_t r16 = vmovl_u8(r);
    //Left shift into position within 16-bit int
    r16 = vshlq_n_u16(r16, 11);
    r5_g6_b5 |= r16;

    //Widen g
    uint16x8_t g16 = vmovl_u8(g);
    //Left shift into position within 16-bit int
    g16 = vshlq_n_u16(g16, 5);

    r5_g6_b5 |= g16;

    //Now write back to memory
    vst1q_u16(outPixel16, r5_g6_b5);        
}
//Do the remainder on normal flt hardware

Code was compiled via LLVM 3.0 into the following (.loc and extra labels removed):

_DNConvert_ARGB8888toRGB565:
    push    {r4, r5, r7, lr}
    mov r9, r1
    mov.w   r12, #0
    add r7, sp, #8
    cmp r2, #0
    mov.w   r1, #0
    it  ne
    movne   r1, #1
    cmp r0, #0
    mov.w   r3, #0
    it  ne
    movne   r3, #1
    cmp.w   r9, #0
    mov.w   r4, #0
    it  ne
    movne   r4, #1
    tst.w   r9, #3
    bne LBB0_8
    ands    r1, r3
    ands    r1, r4
    cmp r1, #1
    bne LBB0_8
    movs    r1, #0
    lsr.w   lr, r9, #2
    cmp.w   r1, r9, lsr #2
    bne LBB0_9
    mov r3, r2
    mov r5, r0
    b   LBB0_5
LBB0_4:
    movw    r1, #65528
    add.w   r0, lr, #7
    movt    r1, #32767
    ands    r1, r0
LBB0_5:
    mov.w   r12, #1
    cmp r1, lr
    bhs LBB0_8
    rsb r0, r1, r9, lsr #2
    mov.w   r9, #63488
    mov.w   lr, #2016
    mov.w   r12, #1
LBB0_7:
    ldr r2, [r5], #4
    subs    r0, #1
    and.w   r1, r9, r2, lsl #8
    and.w   r4, lr, r2, lsr #5
    ubfx    r2, r2, #19, #5
    orr.w   r2, r2, r4
    orr.w   r1, r1, r2
    strh    r1, [r3], #2
    bne LBB0_7
LBB0_8:
    mov r0, r12
    pop {r4, r5, r7, pc}
LBB0_9:
    sub.w   r1, lr, #1
    movs    r3, #32
    add.w   r3, r3, r1, lsl #2
    bic r3, r3, #31
    adds    r5, r0, r3
    movs    r3, #16
    add.w   r1, r3, r1, lsl #1
    bic r1, r1, #15
    adds    r3, r2, r1
    movs    r1, #0
LBB0_10:
    vld4.8  {d16, d17, d18, d19}, [r0]!
    adds    r1, #8
    cmp r1, lr
    vshr.u8 d20, d16, #3
    vshr.u8 d21, d17, #2
    vshr.u8 d16, d18, #3
    vmovl.u8    q11, d20
    vmovl.u8    q9, d21
    vmovl.u8    q8, d16
    vshl.i16    q10, q11, #11
    vshl.i16    q9, q9, #5
    vorr    q8, q8, q10
    vorr    q8, q8, q9
    vst1.16 {d16, d17}, [r2]!
Ltmp28:
    blo LBB0_10
    b   LBB0_4

Full code is available at https://github.com/darknoon/DNImageConvert I would appreciate any help, thanks!

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

深者入戏 2024-12-16 08:42:48

在这里，手工优化的 NEON 实现已准备好用于 XCode：

/* IT DOESN'T WORK!!! USE THE NEXT VERSION BELOW.
 * BGRA2RGB565.s
 *
 * Created by Jake "Alquimista" Lee on 11. 11. 1..
 * Copyright 2011 Jake Lee. All rights reserved.
 */


    .align 2
    .globl _bgra2rgb565_neon
    .private_extern _bgra2rgb565_neon

// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);


//ARM
pDst        .req    r0
pSrc        .req    r1
count       .req    r2

//NEON
blu         .req    d16
grn         .req    d17
red         .req    d18
alp         .req    d19
rg          .req    red
gb          .req    blu

_bgra2rgb565_neon:
    pld     [pSrc]
    tst     count, #0x7
    movne   r0, #0
    bxne    lr

loop:
    pld     [pSrc, #32]
    vld4.8  {blu, grn, red, alp}, [pSrc]!
    subs    count, count, #8
    vshr.u8 red, red, #3
    vext.8  rg, grn, red, #5
    vshr.u8 grn, grn, #2
    vext.8  gb, blu, grn, #3
    vst2.8  {gb, rg}, [pDst]!
    bgt     loop

    bx      lr

此版本将比您建议的速度快很多倍：

通过 PLD 提高缓存命中率
转换为“ long” 没有必要
循环内的指令较少

尽管仍有一些优化空间，您可以修改循环以使其转换16像素每次迭代而不是 8 次。
然后，您可以安排指令来完全避免两次停顿（这在上面的 8/迭代版本中根本不可能实现），并另外受益于 NEON 的双发出功能。

我没有这样做，因为这会使代码难以理解。

了解 VEXT 的作用非常重要。

现在就看你了。 :)

我验证了这段代码可以在 Xcode 下正确编译。
虽然我很确定它也能正常工作，但我不能保证这一点，因为我没有测试环境。
如果出现故障，请告诉我。那么我会相应地纠正它。

cya

===================================================== ===============================

好吧，这是改进的版本。

由于 VSRI 指令的性质不允许除目标之外的两个操作数，因此不可能创建一个关于寄存器分配的更稳健的操作数。

请检查源图像的图像格式。（元素的精确字节顺序）

如果不是 B、G、R、A（iOS 上默认的本机字节顺序），您的应用程序将严重受到 iOS 内部转换的影响。

如果无论出于何种原因绝对不可能更改此设置，请告诉我。
我会写一个与之匹配的新版本。

PS：我忘记删除函数原型开头的下划线。现在它消失了。

/*
 * BGRA2RGB565.s
 *
 * Created by Jake "Alquimista" Lee on 11. 11. 1..
 * Copyright 2011 Jake Lee. All rights reserved.
 *
 * Version 1.1
 * - bug fix
 *
 * Version 1.0
 * - initial release
 */


    .align 2
    .globl _bgra2rgb565_neon
    .private_extern _bgra2rgb565_neon

// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);


//ARM
pDst        .req    r0
pSrc        .req    r1
count       .req    r2

//NEON
blu         .req    d16
grn         .req    d17
red         .req    d18
alp         .req    d19

gb          .req    grn
rg          .req    red

_bgra2rgb565_neon:
    pld     [pSrc]
    tst     count, #0x7
    movne   r0, #0
    bxne    lr

.loop:
    pld     [pSrc, #32]
    vld4.8  {blu, grn, red, alp}, [pSrc]!
    subs    count, count, #8

    vsri.8  red, grn, #5
    vshl.u8 gb, grn, #3
    vsri.8  gb, blu, #3

    vst2.8  {gb, rg}, [pDst]!
    bgt     .loop

    bx      lr

Here you are, hand-optimized NEON implementation ready for XCode :

/* IT DOESN'T WORK!!! USE THE NEXT VERSION BELOW.
 * BGRA2RGB565.s
 *
 * Created by Jake "Alquimista" Lee on 11. 11. 1..
 * Copyright 2011 Jake Lee. All rights reserved.
 */


    .align 2
    .globl _bgra2rgb565_neon
    .private_extern _bgra2rgb565_neon

// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);


//ARM
pDst        .req    r0
pSrc        .req    r1
count       .req    r2

//NEON
blu         .req    d16
grn         .req    d17
red         .req    d18
alp         .req    d19
rg          .req    red
gb          .req    blu

_bgra2rgb565_neon:
    pld     [pSrc]
    tst     count, #0x7
    movne   r0, #0
    bxne    lr

loop:
    pld     [pSrc, #32]
    vld4.8  {blu, grn, red, alp}, [pSrc]!
    subs    count, count, #8
    vshr.u8 red, red, #3
    vext.8  rg, grn, red, #5
    vshr.u8 grn, grn, #2
    vext.8  gb, blu, grn, #3
    vst2.8  {gb, rg}, [pDst]!
    bgt     loop

    bx      lr

This version will be many times faster than what you suggested :

increased cache hit rate via PLD
conversion to "long" not necessary
fewer instructions within the loop

There is still some room for optimizations though, you could modify the loop so that it converts 16 pixels per iteration instead of 8.
Then you can schedule the instructions to avoid the two stalls completely (which is simply not possible in this 8/iteration version above) and benefit from NEON's dual-issue capability in addition.

I didn't do this because it would make the code hard to understand.

It's important to know what VEXT is supposed to do.

Now it's up to you. :)

I verified this code to be properly compiled under Xcode.
Although I'm pretty sure it works correctly as well, I cannot guarantee this since I don't have the test environment.
In case of malfunctioning, please let me know. I'll correct it accordingly then.

cya

==============================================================================

Well, here is the improved version.

Due to the nature of the VSRI instruction not allowing two operands other than the target, it was not possible to create a more robust one regarding the register assignment.

Please check the image format of your source image. (exact byte order of the elements)

If it's not B, G, R, A, which is the default and native one on iOS, your application will suffer heavily from internal conversions by iOS.

If it's absolutely not possible to change this for whatever the reason, let me know.
I'll write a new version matching it.

PS : I forgot to remove the underscore at the start of the function prototype. Now it's gone.

/*
 * BGRA2RGB565.s
 *
 * Created by Jake "Alquimista" Lee on 11. 11. 1..
 * Copyright 2011 Jake Lee. All rights reserved.
 *
 * Version 1.1
 * - bug fix
 *
 * Version 1.0
 * - initial release
 */


    .align 2
    .globl _bgra2rgb565_neon
    .private_extern _bgra2rgb565_neon

// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);


//ARM
pDst        .req    r0
pSrc        .req    r1
count       .req    r2

//NEON
blu         .req    d16
grn         .req    d17
red         .req    d18
alp         .req    d19

gb          .req    grn
rg          .req    red

_bgra2rgb565_neon:
    pld     [pSrc]
    tst     count, #0x7
    movne   r0, #0
    bxne    lr

.loop:
    pld     [pSrc, #32]
    vld4.8  {blu, grn, red, alp}, [pSrc]!
    subs    count, count, #8

    vsri.8  red, grn, #5
    vshl.u8 gb, grn, #3
    vsri.8  gb, blu, #3

    vst2.8  {gb, rg}, [pDst]!
    bgt     .loop

    bx      lr

回复收藏 0 原文

标点 2024-12-16 08:42:48

如果您使用的是 iOS 或 OS X，那么您可能会很高兴在 Accelerate.framework 中发现 vImageConvert_RGBA8888toRGB565() 和朋友。此函数将 8 位值四舍五入到最接近的 565 值。

为了获得更好的抖动（其质量与 8 位颜色几乎没有区别），请尝试 vImageConvert_AnyToAny()：

vImage_CGImageFormat RGBA8888Format = 
{
    .bitsPerComponent = 8,
    .bitsPerPixel = 32,
    .bitmapInfo = kCGBitmapByteOrderDefault | kCGImageAlphaNoneSkipLast,
    .colorSpace = NULL,  // sRGB or substitute your own in
};

vImage_CGImageFormat RGB565Format = 
{
    .bitsPerComponent = 5,
    .bitsPerPixel = 16,
    .bitmapInfo = kCGBitmapByteOrder16Little | kCGImageAlphaNone,
    .colorSpace = RGBA8888Format.colorSpace,  
};


err = vImageConverterRef converter = vImageConverter_CreateWithCGImageFormat(
         &RGBA8888Format, &RGB565Format, NULL, kvImageNoFlags, &err );

err = vImageConvert_AnyToAny( converter, &src, &dest, NULL, kvImageNoFlags );

这两种方法都将进行矢量化和多线程处理，以获得最佳性能。

If you are on iOS or OS X, then you may be delighted to discover vImageConvert_RGBA8888toRGB565() and friends, in Accelerate.framework. This function rounds the 8-bit values to nearest 565 value.

For even better dithering, the quality of which is nearly indistinguishable from 8-bit color, try vImageConvert_AnyToAny():

vImage_CGImageFormat RGBA8888Format = 
{
    .bitsPerComponent = 8,
    .bitsPerPixel = 32,
    .bitmapInfo = kCGBitmapByteOrderDefault | kCGImageAlphaNoneSkipLast,
    .colorSpace = NULL,  // sRGB or substitute your own in
};

vImage_CGImageFormat RGB565Format = 
{
    .bitsPerComponent = 5,
    .bitsPerPixel = 16,
    .bitmapInfo = kCGBitmapByteOrder16Little | kCGImageAlphaNone,
    .colorSpace = RGBA8888Format.colorSpace,  
};


err = vImageConverterRef converter = vImageConverter_CreateWithCGImageFormat(
         &RGBA8888Format, &RGB565Format, NULL, kvImageNoFlags, &err );

err = vImageConvert_AnyToAny( converter, &src, &dest, NULL, kvImageNoFlags );

Either of these approaches will be vectorized and multithreaded for best performance.

回复收藏 0 原文