使用 NEON 优化 RGBA8888 到 RGB565 的转换

发布于 2024-12-09 08:42:48 字数 3524 浏览 0 评论 0原文

我正在尝试使用 NEON 矢量指令集优化 iOS 上的图像格式转换。我认为这可以很好地映射到这一点,因为它处理一堆类似的数据。

不过,我的尝试并没有那么顺利,与天真的 c 实现相比,仅实现了边际加速:

for(int i = 0; i < pixelCount; ++i, ++inPixel32) {
    const unsigned int r = ((*inPixel32 >> 0 ) & 0xFF);
    const unsigned int g = ((*inPixel32 >> 8 ) & 0xFF);
    const unsigned int b = ((*inPixel32 >> 16) & 0xFF);
    *outPixel16++ = ((r >> 3) << 11) | ((g >> 2) << 5) | ((b >> 3) << 0);
}

iPad 2 上的 1 兆像素图像阵列:

格式为 [min avg max n=计时器样本数](以毫秒为单位)

C: [14.446 14.632 18.405 n=1000]ms

氖: [11.920 12.032 15.336 n=1000]ms

我对 NEON 实现的尝试如下:

    int i;
const int pixelsPerLoop = 8;
for(i = 0; i < pixelCount; i += pixelsPerLoop, inPixel32 += pixelsPerLoop, outPixel16 += pixelsPerLoop) {
    //Read all r,g,b pixels into 3 registers
    uint8x8x4_t rgba  = vld4_u8(inPixel32);
    //Right-shift r,g,b as appropriate
    uint8x8_t r = vshr_n_u8(rgba.val[0], 3);
    uint8x8_t g = vshr_n_u8(rgba.val[1], 2);
    uint8x8_t b = vshr_n_u8(rgba.val[2], 3);

    //Widen b
    uint16x8_t r5_g6_b5 = vmovl_u8(b);
    //Widen r
    uint16x8_t r16 = vmovl_u8(r);
    //Left shift into position within 16-bit int
    r16 = vshlq_n_u16(r16, 11);
    r5_g6_b5 |= r16;

    //Widen g
    uint16x8_t g16 = vmovl_u8(g);
    //Left shift into position within 16-bit int
    g16 = vshlq_n_u16(g16, 5);

    r5_g6_b5 |= g16;

    //Now write back to memory
    vst1q_u16(outPixel16, r5_g6_b5);        
}
//Do the remainder on normal flt hardware

代码通过 LLVM 3.0 编译为以下内容(.loc 和额外标签已删除):

_DNConvert_ARGB8888toRGB565:
    push    {r4, r5, r7, lr}
    mov r9, r1
    mov.w   r12, #0
    add r7, sp, #8
    cmp r2, #0
    mov.w   r1, #0
    it  ne
    movne   r1, #1
    cmp r0, #0
    mov.w   r3, #0
    it  ne
    movne   r3, #1
    cmp.w   r9, #0
    mov.w   r4, #0
    it  ne
    movne   r4, #1
    tst.w   r9, #3
    bne LBB0_8
    ands    r1, r3
    ands    r1, r4
    cmp r1, #1
    bne LBB0_8
    movs    r1, #0
    lsr.w   lr, r9, #2
    cmp.w   r1, r9, lsr #2
    bne LBB0_9
    mov r3, r2
    mov r5, r0
    b   LBB0_5
LBB0_4:
    movw    r1, #65528
    add.w   r0, lr, #7
    movt    r1, #32767
    ands    r1, r0
LBB0_5:
    mov.w   r12, #1
    cmp r1, lr
    bhs LBB0_8
    rsb r0, r1, r9, lsr #2
    mov.w   r9, #63488
    mov.w   lr, #2016
    mov.w   r12, #1
LBB0_7:
    ldr r2, [r5], #4
    subs    r0, #1
    and.w   r1, r9, r2, lsl #8
    and.w   r4, lr, r2, lsr #5
    ubfx    r2, r2, #19, #5
    orr.w   r2, r2, r4
    orr.w   r1, r1, r2
    strh    r1, [r3], #2
    bne LBB0_7
LBB0_8:
    mov r0, r12
    pop {r4, r5, r7, pc}
LBB0_9:
    sub.w   r1, lr, #1
    movs    r3, #32
    add.w   r3, r3, r1, lsl #2
    bic r3, r3, #31
    adds    r5, r0, r3
    movs    r3, #16
    add.w   r1, r3, r1, lsl #1
    bic r1, r1, #15
    adds    r3, r2, r1
    movs    r1, #0
LBB0_10:
    vld4.8  {d16, d17, d18, d19}, [r0]!
    adds    r1, #8
    cmp r1, lr
    vshr.u8 d20, d16, #3
    vshr.u8 d21, d17, #2
    vshr.u8 d16, d18, #3
    vmovl.u8    q11, d20
    vmovl.u8    q9, d21
    vmovl.u8    q8, d16
    vshl.i16    q10, q11, #11
    vshl.i16    q9, q9, #5
    vorr    q8, q8, q10
    vorr    q8, q8, q9
    vst1.16 {d16, d17}, [r2]!
Ltmp28:
    blo LBB0_10
    b   LBB0_4

完整代码可在 https://github.com/darknoon/DNImageConvert 我将不胜感激任何帮助,谢谢!

I'm trying to optimize an image format conversion on iOS using the NEON vector instruction set. I assumed this would map well to that because it processes a bunch of similar data.

My attempts haven't gone that well, though, achieving only a marginal speedup vs the naive c implementation:

for(int i = 0; i < pixelCount; ++i, ++inPixel32) {
    const unsigned int r = ((*inPixel32 >> 0 ) & 0xFF);
    const unsigned int g = ((*inPixel32 >> 8 ) & 0xFF);
    const unsigned int b = ((*inPixel32 >> 16) & 0xFF);
    *outPixel16++ = ((r >> 3) << 11) | ((g >> 2) << 5) | ((b >> 3) << 0);
}

1 megapixel image array on iPad 2:

format is [min avg max n=number of timer samples] in milliseconds

C:
[14.446 14.632 18.405 n=1000]ms

NEON:
[11.920 12.032 15.336 n=1000]ms

My attempt at a NEON implementation is below:

    int i;
const int pixelsPerLoop = 8;
for(i = 0; i < pixelCount; i += pixelsPerLoop, inPixel32 += pixelsPerLoop, outPixel16 += pixelsPerLoop) {
    //Read all r,g,b pixels into 3 registers
    uint8x8x4_t rgba  = vld4_u8(inPixel32);
    //Right-shift r,g,b as appropriate
    uint8x8_t r = vshr_n_u8(rgba.val[0], 3);
    uint8x8_t g = vshr_n_u8(rgba.val[1], 2);
    uint8x8_t b = vshr_n_u8(rgba.val[2], 3);

    //Widen b
    uint16x8_t r5_g6_b5 = vmovl_u8(b);
    //Widen r
    uint16x8_t r16 = vmovl_u8(r);
    //Left shift into position within 16-bit int
    r16 = vshlq_n_u16(r16, 11);
    r5_g6_b5 |= r16;

    //Widen g
    uint16x8_t g16 = vmovl_u8(g);
    //Left shift into position within 16-bit int
    g16 = vshlq_n_u16(g16, 5);

    r5_g6_b5 |= g16;

    //Now write back to memory
    vst1q_u16(outPixel16, r5_g6_b5);        
}
//Do the remainder on normal flt hardware

Code was compiled via LLVM 3.0 into the following (.loc and extra labels removed):

_DNConvert_ARGB8888toRGB565:
    push    {r4, r5, r7, lr}
    mov r9, r1
    mov.w   r12, #0
    add r7, sp, #8
    cmp r2, #0
    mov.w   r1, #0
    it  ne
    movne   r1, #1
    cmp r0, #0
    mov.w   r3, #0
    it  ne
    movne   r3, #1
    cmp.w   r9, #0
    mov.w   r4, #0
    it  ne
    movne   r4, #1
    tst.w   r9, #3
    bne LBB0_8
    ands    r1, r3
    ands    r1, r4
    cmp r1, #1
    bne LBB0_8
    movs    r1, #0
    lsr.w   lr, r9, #2
    cmp.w   r1, r9, lsr #2
    bne LBB0_9
    mov r3, r2
    mov r5, r0
    b   LBB0_5
LBB0_4:
    movw    r1, #65528
    add.w   r0, lr, #7
    movt    r1, #32767
    ands    r1, r0
LBB0_5:
    mov.w   r12, #1
    cmp r1, lr
    bhs LBB0_8
    rsb r0, r1, r9, lsr #2
    mov.w   r9, #63488
    mov.w   lr, #2016
    mov.w   r12, #1
LBB0_7:
    ldr r2, [r5], #4
    subs    r0, #1
    and.w   r1, r9, r2, lsl #8
    and.w   r4, lr, r2, lsr #5
    ubfx    r2, r2, #19, #5
    orr.w   r2, r2, r4
    orr.w   r1, r1, r2
    strh    r1, [r3], #2
    bne LBB0_7
LBB0_8:
    mov r0, r12
    pop {r4, r5, r7, pc}
LBB0_9:
    sub.w   r1, lr, #1
    movs    r3, #32
    add.w   r3, r3, r1, lsl #2
    bic r3, r3, #31
    adds    r5, r0, r3
    movs    r3, #16
    add.w   r1, r3, r1, lsl #1
    bic r1, r1, #15
    adds    r3, r2, r1
    movs    r1, #0
LBB0_10:
    vld4.8  {d16, d17, d18, d19}, [r0]!
    adds    r1, #8
    cmp r1, lr
    vshr.u8 d20, d16, #3
    vshr.u8 d21, d17, #2
    vshr.u8 d16, d18, #3
    vmovl.u8    q11, d20
    vmovl.u8    q9, d21
    vmovl.u8    q8, d16
    vshl.i16    q10, q11, #11
    vshl.i16    q9, q9, #5
    vorr    q8, q8, q10
    vorr    q8, q8, q9
    vst1.16 {d16, d17}, [r2]!
Ltmp28:
    blo LBB0_10
    b   LBB0_4

Full code is available at https://github.com/darknoon/DNImageConvert I would appreciate any help, thanks!

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(5

深者入戏 2024-12-16 08:42:48

在这里,手工优化的 NEON 实现已准备好用于 XCode:

/* IT DOESN'T WORK!!! USE THE NEXT VERSION BELOW.
 * BGRA2RGB565.s
 *
 * Created by Jake "Alquimista" Lee on 11. 11. 1..
 * Copyright 2011 Jake Lee. All rights reserved.
 */


    .align 2
    .globl _bgra2rgb565_neon
    .private_extern _bgra2rgb565_neon

// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);


//ARM
pDst        .req    r0
pSrc        .req    r1
count       .req    r2

//NEON
blu         .req    d16
grn         .req    d17
red         .req    d18
alp         .req    d19
rg          .req    red
gb          .req    blu

_bgra2rgb565_neon:
    pld     [pSrc]
    tst     count, #0x7
    movne   r0, #0
    bxne    lr

loop:
    pld     [pSrc, #32]
    vld4.8  {blu, grn, red, alp}, [pSrc]!
    subs    count, count, #8
    vshr.u8 red, red, #3
    vext.8  rg, grn, red, #5
    vshr.u8 grn, grn, #2
    vext.8  gb, blu, grn, #3
    vst2.8  {gb, rg}, [pDst]!
    bgt     loop

    bx      lr

此版本将比您建议的速度快很多倍:

  • 通过 PLD 提高缓存命中率

  • 转换为“ long” 没有必要

  • 循环内的指令较少

尽管仍有一些优化空间,您可以修改循环以使其转换16像素每次迭代而不是 8 次。
然后,您可以安排指令来完全避免两次停顿(这在上面的 8/迭代版本中根本不可能实现),并另外受益于 NEON 的双发出功能。

我没有这样做,因为这会使代码难以理解。

了解 VEXT 的作用非常重要。

现在就看你了。 :)

我验证了这段代码可以在 Xcode 下正确编译。
虽然我很确定它也能正常工作,但我不能保证这一点,因为我没有测试环境。
如果出现故障,请告诉我。那么我会相应地纠正它。

cya

===================================================== ===============================

好吧,这是改进的版本。

由于 VSRI 指令的性质不允许除目标之外的两个操作数,因此不可能创建一个关于寄存器分配的更稳健的操作数。

请检查源图像的图像格式。 (元素的精确字节顺序)

如果不是 B、G、R、A(iOS 上默认的本机字节顺序),您的应用程序将严重受到 iOS 内部转换的影响。

如果无论出于何种原因绝对不可能更改此设置,请告诉我。
我会写一个与之匹配的新版本。

PS:我忘记删除函数原型开头的下划线。现在它消失了。

/*
 * BGRA2RGB565.s
 *
 * Created by Jake "Alquimista" Lee on 11. 11. 1..
 * Copyright 2011 Jake Lee. All rights reserved.
 *
 * Version 1.1
 * - bug fix
 *
 * Version 1.0
 * - initial release
 */


    .align 2
    .globl _bgra2rgb565_neon
    .private_extern _bgra2rgb565_neon

// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);


//ARM
pDst        .req    r0
pSrc        .req    r1
count       .req    r2

//NEON
blu         .req    d16
grn         .req    d17
red         .req    d18
alp         .req    d19

gb          .req    grn
rg          .req    red

_bgra2rgb565_neon:
    pld     [pSrc]
    tst     count, #0x7
    movne   r0, #0
    bxne    lr

.loop:
    pld     [pSrc, #32]
    vld4.8  {blu, grn, red, alp}, [pSrc]!
    subs    count, count, #8

    vsri.8  red, grn, #5
    vshl.u8 gb, grn, #3
    vsri.8  gb, blu, #3

    vst2.8  {gb, rg}, [pDst]!
    bgt     .loop

    bx      lr

Here you are, hand-optimized NEON implementation ready for XCode :

/* IT DOESN'T WORK!!! USE THE NEXT VERSION BELOW.
 * BGRA2RGB565.s
 *
 * Created by Jake "Alquimista" Lee on 11. 11. 1..
 * Copyright 2011 Jake Lee. All rights reserved.
 */


    .align 2
    .globl _bgra2rgb565_neon
    .private_extern _bgra2rgb565_neon

// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);


//ARM
pDst        .req    r0
pSrc        .req    r1
count       .req    r2

//NEON
blu         .req    d16
grn         .req    d17
red         .req    d18
alp         .req    d19
rg          .req    red
gb          .req    blu

_bgra2rgb565_neon:
    pld     [pSrc]
    tst     count, #0x7
    movne   r0, #0
    bxne    lr

loop:
    pld     [pSrc, #32]
    vld4.8  {blu, grn, red, alp}, [pSrc]!
    subs    count, count, #8
    vshr.u8 red, red, #3
    vext.8  rg, grn, red, #5
    vshr.u8 grn, grn, #2
    vext.8  gb, blu, grn, #3
    vst2.8  {gb, rg}, [pDst]!
    bgt     loop

    bx      lr

This version will be many times faster than what you suggested :

  • increased cache hit rate via PLD

  • conversion to "long" not necessary

  • fewer instructions within the loop

There is still some room for optimizations though, you could modify the loop so that it converts 16 pixels per iteration instead of 8.
Then you can schedule the instructions to avoid the two stalls completely (which is simply not possible in this 8/iteration version above) and benefit from NEON's dual-issue capability in addition.

I didn't do this because it would make the code hard to understand.

It's important to know what VEXT is supposed to do.

Now it's up to you. :)

I verified this code to be properly compiled under Xcode.
Although I'm pretty sure it works correctly as well, I cannot guarantee this since I don't have the test environment.
In case of malfunctioning, please let me know. I'll correct it accordingly then.

cya

==============================================================================

Well, here is the improved version.

Due to the nature of the VSRI instruction not allowing two operands other than the target, it was not possible to create a more robust one regarding the register assignment.

Please check the image format of your source image. (exact byte order of the elements)

If it's not B, G, R, A, which is the default and native one on iOS, your application will suffer heavily from internal conversions by iOS.

If it's absolutely not possible to change this for whatever the reason, let me know.
I'll write a new version matching it.

PS : I forgot to remove the underscore at the start of the function prototype. Now it's gone.

/*
 * BGRA2RGB565.s
 *
 * Created by Jake "Alquimista" Lee on 11. 11. 1..
 * Copyright 2011 Jake Lee. All rights reserved.
 *
 * Version 1.1
 * - bug fix
 *
 * Version 1.0
 * - initial release
 */


    .align 2
    .globl _bgra2rgb565_neon
    .private_extern _bgra2rgb565_neon

// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);


//ARM
pDst        .req    r0
pSrc        .req    r1
count       .req    r2

//NEON
blu         .req    d16
grn         .req    d17
red         .req    d18
alp         .req    d19

gb          .req    grn
rg          .req    red

_bgra2rgb565_neon:
    pld     [pSrc]
    tst     count, #0x7
    movne   r0, #0
    bxne    lr

.loop:
    pld     [pSrc, #32]
    vld4.8  {blu, grn, red, alp}, [pSrc]!
    subs    count, count, #8

    vsri.8  red, grn, #5
    vshl.u8 gb, grn, #3
    vsri.8  gb, blu, #3

    vst2.8  {gb, rg}, [pDst]!
    bgt     .loop

    bx      lr
标点 2024-12-16 08:42:48

如果您使用的是 iOS 或 OS X,那么您可能会很高兴在 Accelerate.framework 中发现 vImageConvert_RGBA8888toRGB565() 和朋友。此函数将 8 位值四舍五入到最接近的 565 值。

为了获得更好的抖动(其质量与 8 位颜色几乎没有区别),请尝试 vImageConvert_AnyToAny():

vImage_CGImageFormat RGBA8888Format = 
{
    .bitsPerComponent = 8,
    .bitsPerPixel = 32,
    .bitmapInfo = kCGBitmapByteOrderDefault | kCGImageAlphaNoneSkipLast,
    .colorSpace = NULL,  // sRGB or substitute your own in
};

vImage_CGImageFormat RGB565Format = 
{
    .bitsPerComponent = 5,
    .bitsPerPixel = 16,
    .bitmapInfo = kCGBitmapByteOrder16Little | kCGImageAlphaNone,
    .colorSpace = RGBA8888Format.colorSpace,  
};


err = vImageConverterRef converter = vImageConverter_CreateWithCGImageFormat(
         &RGBA8888Format, &RGB565Format, NULL, kvImageNoFlags, &err );

err = vImageConvert_AnyToAny( converter, &src, &dest, NULL, kvImageNoFlags );

这两种方法都将进行矢量化和多线程处理,以获得最佳性能。

If you are on iOS or OS X, then you may be delighted to discover vImageConvert_RGBA8888toRGB565() and friends, in Accelerate.framework. This function rounds the 8-bit values to nearest 565 value.

For even better dithering, the quality of which is nearly indistinguishable from 8-bit color, try vImageConvert_AnyToAny():

vImage_CGImageFormat RGBA8888Format = 
{
    .bitsPerComponent = 8,
    .bitsPerPixel = 32,
    .bitmapInfo = kCGBitmapByteOrderDefault | kCGImageAlphaNoneSkipLast,
    .colorSpace = NULL,  // sRGB or substitute your own in
};

vImage_CGImageFormat RGB565Format = 
{
    .bitsPerComponent = 5,
    .bitsPerPixel = 16,
    .bitmapInfo = kCGBitmapByteOrder16Little | kCGImageAlphaNone,
    .colorSpace = RGBA8888Format.colorSpace,  
};


err = vImageConverterRef converter = vImageConverter_CreateWithCGImageFormat(
         &RGBA8888Format, &RGB565Format, NULL, kvImageNoFlags, &err );

err = vImageConvert_AnyToAny( converter, &src, &dest, NULL, kvImageNoFlags );

Either of these approaches will be vectorized and multithreaded for best performance.

烟─花易冷 2024-12-16 08:42:48

您可能想使用 vld4q_u8() 而不是 vld4_u8() 并相应地调整其余代码。很难判断问题出在哪里,但汇编器看起来还不错。

You might want to use vld4q_u8() instead of vld4_u8() and adjust the rest of your code accordingly. It's hard to tell where the problem might be, but the assembler doesn't look too bad otherwise.

孤独难免 2024-12-16 08:42:48

(我不熟悉 NEON,也不深入了解 Ipad2 的内存系统,但这就是我们过去使用 88110 像素操作所做的事情,它是当今 SIMD 扩展的早期先驱)

内存延迟有多大?

您能否通过展开内部循环并在“前一个”值上运行 NEON 指令来隐藏它,同时 ARM 从内存中提取“下一个”值?简要浏览 NEON 手册意味着您可以并行运行 ARM 和 NEON 指令。

(I'm not familiar with NEON, nor deeply with the memory system of the Ipad2, but this is what we used to do with 88110 pixel-ops, which were an early precursor to today's SIMD extensions)

How big is the memory latency?

Could you hide it by unrolling the inner loop and running the NEON instructions on the "previous" values while the ARM pulls the "next" values from memory? A brief scan of the NEON manual implies you can run ARM and NEON instructions in parallel.

说不完的你爱 2024-12-16 08:42:48

我不认为将 vld4_u8 转换为 vld4q_u8 会导致性能的任何改善。

代码看起来很简单。我不擅长 ASM,因此需要一些时间来深入研究它。

霓虹灯看起来很简单。但我不太确定是否使用 r5_g6_b5 |= g16 代替 vorrq_u16

请也看看优化级别。据我所知,neon 代码优化级别最大为 1。因此,当参考代码和 neon 代码都考虑默认优化时,性能可能会有所不同,因为 DEFAULT 的参考优化级别可能是不同的。

我在 neon 中没有发现任何可以改进当前代码的区域。

I don't think converting vld4_u8 to vld4q_u8 would lead to any bettering of the performance.

The code seems simple enough. I am not good at ASM and so it would take some time to look into it deeply.

The neon seems simple enough. But I am not quiet sure about r5_g6_b5 |= g16 being used instead of vorrq_u16

Please have a look at the optimization level too. As far as what I heard neon code optimization level goes to a maximum of 1. So the performance may differ when default optimization is being taken into account for both the reference code and neon code, as the level of optimization of reference by DEFAULT may be different.

I doesnt find any area in neon that can better the current code.

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文