如何使用 NEON intrinsics 优化为图像添加填充（padding）的函数？

发布于 2025-02-03 03:56:16 字数 620 浏览 1 评论 0原文

我是 NEON 的新手，虽然能够完成一些处理，但由于对某些基本概念缺乏了解而感到吃力，尤其是在优化二维数组方面。

/**
 * Build a copy of `img` surrounded by a one-pixel border of zeros.
 *
 * img    - array of `height` row pointers, each addressing `width` bytes.
 * width  - number of pixels in each source row.
 * height - number of source rows.
 *
 * Returns an array of (height + 2) row pointers, each addressing
 * (width + 2) bytes, or NULL when a dimension is negative or any allocation
 * fails (nothing is leaked in that case). The caller owns the result and
 * must free every row and then the array itself.
 */
uint8_t** add_padding(uint8_t** img,int width, int height) {
    if (width < 0 || height < 0) {
        return NULL;
    }

    /* Do the "+ 2" in size_t so it cannot overflow int. */
    const size_t padded_rows = (size_t)height + 2;
    const size_t padded_cols = (size_t)width + 2;

    uint8_t **padded_image = calloc(padded_rows, sizeof *padded_image);
    if (padded_image == NULL) {
        return NULL;
    }

    for (size_t i = 0; i < padded_rows; i++) {
        padded_image[i] = calloc(padded_cols, sizeof **padded_image);
        if (padded_image[i] == NULL) {
            /* Roll back the rows allocated so far before reporting failure. */
            while (i > 0) {
                free(padded_image[--i]);
            }
            free(padded_image);
            return NULL;
        }
    }

    /* calloc already zeroed the border; only the interior needs copying. */
    for (int i = 0; i < height; i++) {
        const uint8_t *src = img[i];
        uint8_t *dst = padded_image[i + 1] + 1;
        for (int j = 0; j < width; j++) {
            dst[j] = src[j];
        }
    }

    return padded_image;
}

如何在 C 中使用 NEON intrinsics 对上面的函数进行向量化？

原文

I'm new to NEON, and whilst I can do some processing, I struggle with a lack of knowledge of some basic concepts, especially when optimizing 2D arrays.

/**
 * Build a copy of `img` surrounded by a one-pixel border of zeros.
 *
 * img    - array of `height` row pointers, each addressing `width` bytes.
 * width  - number of pixels in each source row.
 * height - number of source rows.
 *
 * Returns an array of (height + 2) row pointers, each addressing
 * (width + 2) bytes, or NULL when a dimension is negative or any allocation
 * fails (nothing is leaked in that case). The caller owns the result and
 * must free every row and then the array itself.
 */
uint8_t** add_padding(uint8_t** img,int width, int height) {
    if (width < 0 || height < 0) {
        return NULL;
    }

    /* Do the "+ 2" in size_t so it cannot overflow int. */
    const size_t padded_rows = (size_t)height + 2;
    const size_t padded_cols = (size_t)width + 2;

    uint8_t **padded_image = calloc(padded_rows, sizeof *padded_image);
    if (padded_image == NULL) {
        return NULL;
    }

    for (size_t i = 0; i < padded_rows; i++) {
        padded_image[i] = calloc(padded_cols, sizeof **padded_image);
        if (padded_image[i] == NULL) {
            /* Roll back the rows allocated so far before reporting failure. */
            while (i > 0) {
                free(padded_image[--i]);
            }
            free(padded_image);
            return NULL;
        }
    }

    /* calloc already zeroed the border; only the interior needs copying. */
    for (int i = 0; i < height; i++) {
        const uint8_t *src = img[i];
        uint8_t *dst = padded_image[i + 1] + 1;
        for (int j = 0; j < width; j++) {
            dst[j] = src[j];
        }
    }

    return padded_image;
}

How can I vectorize the function above using NEON intrinsics in C?

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

雪花飘飘的天空 2025-02-10 03:56:16

有两点值得注意。

如果可能，请使用连续的内存分配。

规划好如何处理多余的数据宽度（即宽度不是 SIMD 宽度整数倍的情况）。我通常使用重叠 SIMD 寄存器的方法：

 if (width < 16) abort();   // or handle with non-simd methods

 *output_row++ = 0;  // write the left margin
 
 // handle multiples of SIMD width -- read and write 16 bytes at a time
 while (width >= 16) {
    uint8x16_t data = vld1q_u8(input_row);
    input_row += 16;
    width -= 16;
    vst1q_u8(output_row, data);
    output_row += 16;
 }

 // handle the excess by loading the 16 last characters of the row
 if (width) {
    input_row += width;
    output_row += width;
    uint8x16_t data = vld1q_u8(input_row - 16);
    vst1q_u8(output_row - 16, data);
 }

 // handle the right margin
 *output_row++ = 0;

每一侧宽度为 1 的边距暗示您打算使用某个 3x3 内核对图像进行滤波，但即使没有显式的边距，也可以高效地完成滤波。

 // Horizontal 3-tap maximum over one row, 16 pixels per step, with an
 // implicit zero margin on both sides. input_row and output_row must not
 // overlap, because the tail re-reads raw input that an earlier step has
 // already produced output for.
 if (width < 16) return alternative_non_simd_implementation();

 uint8x16_t previous = vdupq_n_u8(0);   // zero initial margin
 // vs uint8x16_t previous = vdupq_n_u8(*input_row); // replicated margin

 // horizontally form three vectors
 // Z|0123456789abcde        <-- left, Z is lane 15 of the previous raw chunk
 //   0123456789abcdeF       <-- current
 //    123456789abcdefG      <-- right, G is peeked from the next chunk
 while (width > 16) {
    uint8x16_t next = vdupq_n_u8(input_row[16]); // peek 1 element
    uint8x16_t current = vld1q_u8(input_row);
    uint8x16_t left = vextq_u8(previous, current, 15);
    uint8x16_t right = vextq_u8(current, next, 1);
    // then compute something from the 3 vectors, write it to target
    vst1q_u8(output_row, vmaxq_u8(vmaxq_u8(left, right), current));
    // carry the RAW input forward: the next chunk's left neighbour must be
    // an unfiltered pixel, not the value that was just written out
    previous = current;
    width -= 16;
    input_row += 16;
    output_row += 16;  // keep the destination in step with the source
 }
 // then handle the excess...
 // previous = ........Z, in case width == 16
 // previous = ........F, in case width == 32
 // if width % 16 == 0, previous contains already a valid byte
 // otherwise we need to read it explicitly
 if (width & 15) {
    // revert the pointers so that the last byte of the row
    // falls in lane 15 of the SIMD register
    input_row -= 16;
    input_row += width;
    output_row -= 16;
    output_row += width;
    // there is at least one readable data that is not margin
    // to the left of input_row, which we read
    previous = vdupq_n_u8(input_row[-1]);
 }
 
 uint8x16_t current = vld1q_u8(input_row);
 uint8x16_t next = vdupq_n_u8(0);       // right margin
 previous = vextq_u8(previous, current, 15);
 next = vextq_u8(current, next, 1);

 // again, compute something + write it
 current = vmaxq_u8(vmaxq_u8(previous, next), current);
 vst1q_u8(output_row, current);

该片段只处理了一行，但当然可以扩展为从三个指针读取。

 /*
  * Walk a contiguous width x height image row by row and pass each row,
  * together with the rows above and below it, to process_three_rows().
  * A single shared all-zero row stands in for the missing neighbour above
  * the first row and below the last row.
  *
  * start_ptr - first byte of the source image (rows are `width` bytes apart).
  * out_ptr   - destination; advanced by `width` bytes after every row.
  * width     - bytes per row.
  * height    - number of rows.
  *
  * Does nothing when a dimension is not positive or when the zero row
  * cannot be allocated.
  */
 void outer_loop(uint8_t *start_ptr, uint8_t *out_ptr, int width, int height) {
   if (width <= 0 || height <= 0) {
      return;
   }

   // calloc takes (count, element size); have a single zero row
   uint8_t *zero = calloc((size_t)width, sizeof *zero);
   if (zero == NULL) {
      return;
   }
   
   for (int i = 0; i < height; i++) {
      // compute the row offset in size_t so large images do not overflow int
      uint8_t *mid_row = start_ptr + (size_t)i * (size_t)width;
      uint8_t *top_row = i > 0 ? mid_row - width : zero;
      uint8_t *bot_row = i < height-1 ? mid_row + width : zero;
      
      process_three_rows(top_row, mid_row, bot_row, width, out_ptr);
      out_ptr += width;
   }
   free(zero);
 }

Two things pop up.

If possible, use contiguous memory allocation.

Plan how you are going to implement excess data widths (non-multiples of SIMD width). I typically use the method of overlapped SIMD registers:

 if (width < 16) abort();   // or handle with non-simd methods

 *output_row++ = 0;  // write the left margin
 
 // handle multiples of SIMD width -- read and write 16 bytes at a time
 while (width >= 16) {
    uint8x16_t data = vld1q_u8(input_row);
    input_row += 16;
    width -= 16;
    vst1q_u8(output_row, data);
    output_row += 16;
 }

 // handle the excess by loading the 16 last characters of the row
 if (width) {
    input_row += width;
    output_row += width;
    uint8x16_t data = vld1q_u8(input_row - 16);
    vst1q_u8(output_row - 16, data);
 }

 // handle the right margin
 *output_row++ = 0;

A margin of 1 at every side hints that you are going to filter the image using some 3x3 kernel, but the filtering can be done efficiently even without the explicit margin.

 // Horizontal 3-tap maximum over one row, 16 pixels per step, with an
 // implicit zero margin on both sides. input_row and output_row must not
 // overlap, because the tail re-reads raw input that an earlier step has
 // already produced output for.
 if (width < 16) return alternative_non_simd_implementation();

 uint8x16_t previous = vdupq_n_u8(0);   // zero initial margin
 // vs uint8x16_t previous = vdupq_n_u8(*input_row); // replicated margin

 // horizontally form three vectors
 // Z|0123456789abcde        <-- left, Z is lane 15 of the previous raw chunk
 //   0123456789abcdeF       <-- current
 //    123456789abcdefG      <-- right, G is peeked from the next chunk
 while (width > 16) {
    uint8x16_t next = vdupq_n_u8(input_row[16]); // peek 1 element
    uint8x16_t current = vld1q_u8(input_row);
    uint8x16_t left = vextq_u8(previous, current, 15);
    uint8x16_t right = vextq_u8(current, next, 1);
    // then compute something from the 3 vectors, write it to target
    vst1q_u8(output_row, vmaxq_u8(vmaxq_u8(left, right), current));
    // carry the RAW input forward: the next chunk's left neighbour must be
    // an unfiltered pixel, not the value that was just written out
    previous = current;
    width -= 16;
    input_row += 16;
    output_row += 16;  // keep the destination in step with the source
 }
 // then handle the excess...
 // previous = ........Z, in case width == 16
 // previous = ........F, in case width == 32
 // if width % 16 == 0, previous contains already a valid byte
 // otherwise we need to read it explicitly
 if (width & 15) {
    // revert the pointers so that the last byte of the row
    // falls in lane 15 of the SIMD register
    input_row -= 16;
    input_row += width;
    output_row -= 16;
    output_row += width;
    // there is at least one readable data that is not margin
    // to the left of input_row, which we read
    previous = vdupq_n_u8(input_row[-1]);
 }
 
 uint8x16_t current = vld1q_u8(input_row);
 uint8x16_t next = vdupq_n_u8(0);       // right margin
 previous = vextq_u8(previous, current, 15);
 next = vextq_u8(current, next, 1);

 // again, compute something + write it
 current = vmaxq_u8(vmaxq_u8(previous, next), current);
 vst1q_u8(output_row, current);

That fragment only handled one row, but it can of course be extended to read from three pointers.

 /*
  * Walk a contiguous width x height image row by row and pass each row,
  * together with the rows above and below it, to process_three_rows().
  * A single shared all-zero row stands in for the missing neighbour above
  * the first row and below the last row.
  *
  * start_ptr - first byte of the source image (rows are `width` bytes apart).
  * out_ptr   - destination; advanced by `width` bytes after every row.
  * width     - bytes per row.
  * height    - number of rows.
  *
  * Does nothing when a dimension is not positive or when the zero row
  * cannot be allocated.
  */
 void outer_loop(uint8_t *start_ptr, uint8_t *out_ptr, int width, int height) {
   if (width <= 0 || height <= 0) {
      return;
   }

   // calloc takes (count, element size); have a single zero row
   uint8_t *zero = calloc((size_t)width, sizeof *zero);
   if (zero == NULL) {
      return;
   }
   
   for (int i = 0; i < height; i++) {
      // compute the row offset in size_t so large images do not overflow int
      uint8_t *mid_row = start_ptr + (size_t)i * (size_t)width;
      uint8_t *top_row = i > 0 ? mid_row - width : zero;
      uint8_t *bot_row = i < height-1 ? mid_row + width : zero;
      
      process_three_rows(top_row, mid_row, bot_row, width, out_ptr);
      out_ptr += width;
   }
   free(zero);
 }

回复收藏 0 原文

~没有更多了~