使用arm neon进行RGB到灰度转换

发布于 2024-12-20 22:31:56 字数 2368 浏览 1 评论 0原文

我正在尝试有效地从 RGB 转换为灰度，所以我从此处解释了如何从 rgba 转换为灰度。现在我正在尝试做同样的事情，但只使用 RGB。我改变了一些东西，但似乎效果不佳。我不知道为什么，有人看到我的错误吗？

void neon_asm_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
{
    __asm__ volatile(
     "lsr %2, %2, #3 \n"
     "# build the three constants:  \n"
     "mov r4, #28                   \n" // Blue channel multiplier
     "mov r5, #151                  \n" // Green channel multiplier
     "mov r6, #77                   \n" // Red channel multiplier
     "vdup.8 d4, r4                 \n"
     "vdup.8 d5, r5                 \n"
     "vdup.8 d6, r6                 \n"
     "0: \n"
     "# load 8 pixels: \n"  //RGBR
     "vld4.8 {d0-d3}, [%1]! \n"
     "# do the weight average: \n"
     "vmull.u8 q7, d0, d4 \n"
     "vmlal.u8 q7, d1, d5 \n"
     "vmlal.u8 q7, d2, d6 \n"
     "# shift and store: \n"
     "vshrn.u16 d7, q7, #8 \n" // Divide q3 by 256 and store in the d7
     "vst1.8 {d7}, [%0]! \n"
     "subs %2, %2, #1 \n" // Decrement iteration count

     "# load 8 pixels: \n"
     "vld4.8 {d8-d11}, [%1]! \n" //Other GBRG
     "# do the weight average: \n"
     "vmull.u8 q7, d3, d4 \n"
     "vmlal.u8 q7, d8, d5 \n"
     "vmlal.u8 q7, d9, d6 \n"
     "# shift and store: \n"
     "vshrn.u16 d7, q7, #8 \n" // Divide q3 by 256 and store in the d7
     "vst1.8 {d7}, [%0]! \n"
     "subs %2, %2, #1 \n" // Decrement iteration count

     "# load 8 pixels: \n"
     "vld4.8 {d0-d3}, [%1]! \n"
     "# do the weight average: \n"
     "vmull.u8 q7, d10, d4 \n"
     "vmlal.u8 q7, d11, d5 \n"
     "vmlal.u8 q7, d0, d6 \n"
     "# shift and store: \n"
     "vshrn.u16 d7, q7, #8 \n" // Divide q3 by 256 and store in the d7
     "vst1.8 {d7}, [%0]! \n"
     "subs %2, %2, #1 \n" // Decrement iteration count


     "# do the weight average: \n"
     "vmull.u8 q7, d1, d4 \n"
     "vmlal.u8 q7, d2, d5 \n"
     "vmlal.u8 q7, d3, d6 \n"
     "# shift and store: \n"
     "vshrn.u16 d7, q7, #8 \n" // Divide q3 by 256 and store in the d7
     "vst1.8 {d7}, [%0]! \n"

     "subs %2, %2, #1 \n" // Decrement iteration count



     "bne 0b \n" // Repeat unil iteration count is not zero
     :
     : "r"(dest), "r"(src), "r"(numPixels)
     : "r4", "r5", "r6"
    );
}

原文

I´m trying to convert from rgb to grayscale efficiently, so I got a function from here where it explains how to convert from rgba to grayscale. Now I´m trying to do the same but with just rgb. I changed some things but it doesn´t seem to work well. I don´t know why, does anyone see my mistake?

void neon_asm_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
{
    __asm__ volatile(
     "lsr %2, %2, #3 \n"
     "# build the three constants:  \n"
     "mov r4, #28                   \n" // Blue channel multiplier
     "mov r5, #151                  \n" // Green channel multiplier
     "mov r6, #77                   \n" // Red channel multiplier
     "vdup.8 d4, r4                 \n"
     "vdup.8 d5, r5                 \n"
     "vdup.8 d6, r6                 \n"
     "0: \n"
     "# load 8 pixels: \n"  //RGBR
     "vld4.8 {d0-d3}, [%1]! \n"
     "# do the weight average: \n"
     "vmull.u8 q7, d0, d4 \n"
     "vmlal.u8 q7, d1, d5 \n"
     "vmlal.u8 q7, d2, d6 \n"
     "# shift and store: \n"
     "vshrn.u16 d7, q7, #8 \n" // Divide q3 by 256 and store in the d7
     "vst1.8 {d7}, [%0]! \n"
     "subs %2, %2, #1 \n" // Decrement iteration count

     "# load 8 pixels: \n"
     "vld4.8 {d8-d11}, [%1]! \n" //Other GBRG
     "# do the weight average: \n"
     "vmull.u8 q7, d3, d4 \n"
     "vmlal.u8 q7, d8, d5 \n"
     "vmlal.u8 q7, d9, d6 \n"
     "# shift and store: \n"
     "vshrn.u16 d7, q7, #8 \n" // Divide q3 by 256 and store in the d7
     "vst1.8 {d7}, [%0]! \n"
     "subs %2, %2, #1 \n" // Decrement iteration count

     "# load 8 pixels: \n"
     "vld4.8 {d0-d3}, [%1]! \n"
     "# do the weight average: \n"
     "vmull.u8 q7, d10, d4 \n"
     "vmlal.u8 q7, d11, d5 \n"
     "vmlal.u8 q7, d0, d6 \n"
     "# shift and store: \n"
     "vshrn.u16 d7, q7, #8 \n" // Divide q3 by 256 and store in the d7
     "vst1.8 {d7}, [%0]! \n"
     "subs %2, %2, #1 \n" // Decrement iteration count


     "# do the weight average: \n"
     "vmull.u8 q7, d1, d4 \n"
     "vmlal.u8 q7, d2, d5 \n"
     "vmlal.u8 q7, d3, d6 \n"
     "# shift and store: \n"
     "vshrn.u16 d7, q7, #8 \n" // Divide q3 by 256 and store in the d7
     "vst1.8 {d7}, [%0]! \n"

     "subs %2, %2, #1 \n" // Decrement iteration count



     "bne 0b \n" // Repeat unil iteration count is not zero
     :
     : "r"(dest), "r"(src), "r"(numPixels)
     : "r4", "r5", "r6"
    );
}

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

北陌 2024-12-27 22:31:56

您应该使用 "vld3.8 {d0-d2}, [%1]! \n"

另请参阅 http://hilbert-space.de/?p=22

回复收藏 0 原文

失眠症患者 2024-12-27 22:31:56

您加载四个值 (RGBA)，而不是 3 个值 (RGB)。

您的图像中有RGB RGB RGB，但您在连续步骤中加载RGBR GBRG B...等。

"vld4.8 {d0-d3}, [%1]! \n"

相反，您应该

"vld3.8 {d0-d2}, [%1]! \n"

注意，我不知道我的 asm 是否正确，但这是错误。
将像素移回内存时还要检查是否存在相同的错误

You load four values (RGBA) instead of 3 (RGB).

you have RGB RGB RGB in your image, but you load RGBR GBRG B... etc in consecutive steps.

"vld4.8 {d0-d3}, [%1]! \n"

Instead you should

"vld3.8 {d0-d2}, [%1]! \n"

Note that I have no idea if my asm is correct, but here is the mistake.
Also check for the same mistake when moving the pixels back to memory

回复收藏 0 原文

惟欲睡 2024-12-27 22:31:56

瓦西里是对的。使用VLD3加载24位像素。

您还有 3 个 VLDx 用于 4 个 VSTx
事实上你的代码很奇怪......

你不必重复代码。这解释起来相当复杂，但你对 NEON 没有兴趣重复 4 次你的代码

void neon_asm_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
{
  __asm__ volatile(
   "# build the three constants:  \n"
   "mov r4, #28                   \n" // Blue channel multiplier
   "mov r5, #151                  \n" // Green channel multiplier
   "mov r6, #77                   \n" // Red channel multiplier
   "vdup.8 d4, r4                 \n"
   "vdup.8 d5, r5                 \n"
   "vdup.8 d6, r6                 \n"

   "0: \n"
   "# load 8 pixels: \n"  //RGBR
   "vld3.8 {d0-d2}, [%1]! \n"
   "# do the weight average: \n"
   "vmull.u8 q7, d0, d4 \n"
   "vmlal.u8 q7, d1, d5 \n"
   "vmlal.u8 q7, d2, d6 \n"
   "# shift and store: \n"
   "vshrn.u16 d7, q7, #8 \n" // Divide q3 by 256 and store in the d7
   "vst1.8 {d7}, [%0]! \n"
   "subs %2, %2, #1 \n" // Decrement iteration count
   "bne 0b \n" // Repeat unil iteration count is not zero
   :
   : "r"(dest), "r"(src), "r"(numPixels)
   : "r4", "r5", "r6"
  );
}

应该可以工作。

Vasile is right. Use VLD3 to load 24bit pixels.

You also have 3 VLDx for 4 VSTx
In fact your code is quite strange...

You don't have to duplicate the code. That's quite complexe to explain but you'll have no interest with NEON to repeat 4 times your code

void neon_asm_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
{
  __asm__ volatile(
   "# build the three constants:  \n"
   "mov r4, #28                   \n" // Blue channel multiplier
   "mov r5, #151                  \n" // Green channel multiplier
   "mov r6, #77                   \n" // Red channel multiplier
   "vdup.8 d4, r4                 \n"
   "vdup.8 d5, r5                 \n"
   "vdup.8 d6, r6                 \n"

   "0: \n"
   "# load 8 pixels: \n"  //RGBR
   "vld3.8 {d0-d2}, [%1]! \n"
   "# do the weight average: \n"
   "vmull.u8 q7, d0, d4 \n"
   "vmlal.u8 q7, d1, d5 \n"
   "vmlal.u8 q7, d2, d6 \n"
   "# shift and store: \n"
   "vshrn.u16 d7, q7, #8 \n" // Divide q3 by 256 and store in the d7
   "vst1.8 {d7}, [%0]! \n"
   "subs %2, %2, #1 \n" // Decrement iteration count
   "bne 0b \n" // Repeat unil iteration count is not zero
   :
   : "r"(dest), "r"(src), "r"(numPixels)
   : "r4", "r5", "r6"
  );
}

Should works.

回复收藏 0 原文

~没有更多了~