如何使用CUDA快速找到另一张图像中的图像？

发布于 2024-12-04 08:12:05 字数 9749 浏览 0 评论 0原文

在我当前的项目中，我需要找到另一张较大尺寸图像中包含的图像的像素精确位置。较小的图像永远不会旋转或拉伸（因此应该逐像素匹配），但它可能具有不同的亮度，并且图像中的某些像素可能会扭曲。我的第一次尝试是在 CPU 上完成，但速度太慢。计算非常并行，所以我决定使用GPU。我刚刚开始学习 CUDA 并编写了我的第一个 CUDA 应用程序。我的代码可以工作，但即使在 GPU 上它仍然太慢。当较大图像的尺寸为 1024x1280 且较小图像尺寸为 128x128 时，程序在 GeForce GTX 560 ti 上执行计算需要 2000 毫秒。我需要在 200 毫秒内得到结果。将来我可能需要更复杂的算法，所以我宁愿有更多的计算能力储备。问题是我如何优化我的代码以实现加速？

CUDAImageLib.dll：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <cutil.h>

//#define SUPPORT_ALPHA

// Sum-of-absolute-differences template search kernel.
//
// Each thread owns one candidate offset (bx, by) of the small image inside the
// big image and accumulates the absolute difference of the first three
// channels over every pixel of the small image. Both images are laid out as
// 4 consecutive floats per pixel (hence the "* 4" in the indexing). The fourth
// channel is only read when SUPPORT_ALPHA is defined, where both alpha values
// weight the per-pixel difference.
//
// Launch layout: 2D grid of 2D blocks whose threads cover at least
// (bw - sw) x (bh - sh) offsets; surplus threads exit immediately.
// Shared memory: none.
// Preconditions: bw > sw and bh > sh (the subtraction is unsigned).
//
// BufferOut receives (bw - sw) * (bh - sh) floats, row-major, one score per
// offset: BufferOut[bx + by * (bw - sw)].
__global__ void ImageSearch_kernel(float* __restrict__ BufferOut, const float* __restrict__ BufferB, const float* __restrict__ BufferS, unsigned int bw, unsigned int bh, unsigned int sw, unsigned int sh)
{
    const unsigned int outW = bw - sw;
    const unsigned int outH = bh - sh;
    const unsigned int bx = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int by = threadIdx.y + blockIdx.y * blockDim.y;

    // The grid is rounded up to whole blocks, so threads past the edge of the
    // result map must not read or write anything.
    if (bx >= outW || by >= outH)
        return;

    float diff = 0.0f;
    for (unsigned int y = 0; y < sh; ++y)
    {
        // Row bases are loop-invariant for the inner loop; size_t keeps the
        // element offsets from wrapping on large images.
        const float* rowS = BufferS + (size_t)y * sw * 4;
        const float* rowB = BufferB + ((size_t)(y + by) * bw + bx) * 4;
        for (unsigned int x = 0; x < sw; ++x)
        {
            const float* ps = rowS + (size_t)x * 4;
            const float* pb = rowB + (size_t)x * 4;
#ifdef SUPPORT_ALPHA
            diff += ((fabsf(ps[0] - pb[0]) + fabsf(ps[1] - pb[1]) + fabsf(ps[2] - pb[2])) * ps[3] * pb[3]);
#else
            diff += fabsf(ps[0] - pb[0]);
            diff += fabsf(ps[1] - pb[1]);
            diff += fabsf(ps[2] - pb[2]);
#endif
        }
    }
    BufferOut[bx + (size_t)by * outW] = diff;
}

// Host entry point exported from the DLL: uploads both images, runs
// ImageSearch_kernel for every offset and downloads the score map.
//
// Parameters:
//   BufferOut - host array receiving (bw - sw) * (bh - sh) floats.
//   BufferB   - host array with the big image, bw * bh * 4 floats.
//   BufferS   - host array with the small image, sw * sh * 4 floats.
//   bw, bh    - big image width / height in pixels.
//   sw, sh    - small image width / height in pixels; must be strictly
//               smaller than bw / bh.
//
// Returns the time in milliseconds (truncated to int) measured by the cutil
// timer around the kernel launch and the following device synchronisation,
// or -1 if the arguments are invalid or any CUDA call fails. On failure the
// contents of BufferOut are unspecified.
extern "C" int __declspec(dllexport) __stdcall ImageSearchGPU(float* BufferOut, float* BufferB, float* BufferS, int bw, int bh, int sw, int sh)
{
    if (BufferOut == 0 || BufferB == 0 || BufferS == 0 || sw <= 0 || sh <= 0 || bw <= sw || bh <= sh) {
        fprintf(stderr, "ImageSearchGPU: invalid arguments (big %dx%d, small %dx%d)\n", bw, bh, sw, sh);
        return -1;
    }

    // NOTE: every local is declared before the first "goto Error" so that no
    // jump crosses an initialisation.
    const unsigned int outW = (unsigned int)(bw - sw);
    const unsigned int outH = (unsigned int)(bh - sh);

    // Byte counts are computed in size_t; the int products overflow for large images.
    const size_t aBytes = (size_t)bw * (size_t)bh * 4 * sizeof(float);
    const size_t bBytes = (size_t)sw * (size_t)sh * 4 * sizeof(float);
    const size_t cBytes = (size_t)outW * (size_t)outH * sizeof(float);

    // Round the grid up so the right/bottom tail of the result map is computed
    // too; the kernel discards the surplus threads.
    dim3 threadsPerBlock(32, 32);
    dim3 numBlocks((outW + threadsPerBlock.x - 1) / threadsPerBlock.x, (outH + threadsPerBlock.y - 1) / threadsPerBlock.y);

    float *dev_B = 0;
    float *dev_S = 0;
    float *dev_Out = 0;

    unsigned int timer = 0;
    bool timerCreated = false;
    float sExecutionTime = 0;
    int result = -1;

    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the score map and the two input images.
    cudaStatus = cudaMalloc((void**)&dev_Out, cBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_B, aBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_S, bBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy both input images from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_B, BufferB, aBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_S, BufferS, bBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cutCreateTimer(&timer);
    timerCreated = true;
    cutStartTimer(timer);

    // Launch one thread per candidate offset.
    ImageSearch_kernel<<<numBlocks, threadsPerBlock>>>(dev_Out, dev_B, dev_S, bw, bh, sw, sh);

    // A bad launch configuration is only reported through cudaGetLastError.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "ImageSearch_kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered while it was running.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching ImageSearch_kernel!\n", cudaStatus);
        goto Error;
    }

    cutStopTimer(timer);
    sExecutionTime = cutGetTimerValue(timer);

    // Copy the score map from the GPU buffer to host memory.
    cudaStatus = cudaMemcpy(BufferOut, dev_Out, cBytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    result = (int)sExecutionTime;

Error:
    // Shared cleanup for the success and failure paths; cudaFree(0) is a no-op.
    if (timerCreated) {
        cutDeleteTimer(timer);
    }
    cudaFree(dev_Out);
    cudaFree(dev_B);
    cudaFree(dev_S);
    return result;
}

// Linear scan on the host for the position of the smallest value.
//
//   values - array holding at least `count` floats.
//   count  - number of elements to examine.
//
// Returns the index of the first element that is strictly smaller than every
// element before it and below the 3.4e+38F starting threshold; returns 0 when
// no element qualifies (including count <= 0).
extern "C" int __declspec(dllexport) __stdcall FindMinCPU(float* values, int count)
{
    int bestIndex = 0;
    float bestValue = 3.4e+38F;

    int pos = 0;
    while (pos < count)
    {
        const float candidate = values[pos];
        if (candidate < bestValue)
        {
            bestIndex = pos;
            bestValue = candidate;
        }
        ++pos;
    }

    return bestIndex;
}

C# 测试应用程序：

using System;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
using System.Drawing;

namespace TestCUDAImageSearch
{
    /// <summary>
    /// Console driver: loads "Big.png" and "Small.png", runs the image search
    /// once and prints the reported position together with the timings.
    /// </summary>
    class Program
    {
        /// <summary>Program entry point; <paramref name="args"/> is not used.</summary>
        static void Main(string[] args)
        {
            using (Bitmap big = new Bitmap("Big.png"))
            using (Bitmap small = new Bitmap("Small.png"))
            {
                Console.WriteLine("Big " + big.Width + "x" + big.Height + "    Small " + small.Width + "x" + small.Height);

                // Wall-clock time of the whole search call, as opposed to the
                // kernel time reported by the library.
                Stopwatch watch = Stopwatch.StartNew();
                Point location = CUDAImageLIb.ImageSearch(big, small);
                watch.Stop();

                long totalMs = watch.ElapsedMilliseconds;
                Console.WriteLine("Image found at " + location.X + "x" + location.Y);
                Console.WriteLine("total time=" + totalMs + "ms     kernel time=" + CUDAImageLIb.LastKernelTime + "ms");
            }

            // Keep the console window open until a key is pressed.
            Console.WriteLine("Hit key");
            Console.ReadKey();
        }
    }
}



//#define SUPPORT_HSB

using System;
using System.Collections.Generic;
using System.Text;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;

namespace TestCUDAImageSearch
{
    /// <summary>
    /// Managed wrapper around CUDAImageLib.dll: converts bitmaps to float
    /// arrays (4 floats per pixel), asks the native library for a per-offset
    /// difference map and interprets that map.
    /// </summary>
    public static class CUDAImageLIb
    {
        [DllImport("CUDAImageLib.dll")]
        private static extern int ImageSearchGPU(float[] bufferOut, float[] bufferB, float[] bufferS, int bw, int bh, int sw, int sh);

        [DllImport("CUDAImageLib.dll")]
        private static extern int FindMinCPU(float[] values, int count);

        private static int _lastKernelTime = 0;

        /// <summary>Value returned by the most recent ImageSearchGPU call.</summary>
        public static int LastKernelTime
        {
            get { return _lastKernelTime; }
        }

        /// <summary>
        /// Finds the offset of <paramref name="small"/> inside
        /// <paramref name="big"/> with the lowest difference score.
        /// </summary>
        /// <returns>Top-left offset of the best match, in pixels of <paramref name="big"/>.</returns>
        /// <exception cref="ArgumentNullException">An image is null.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="small"/> is not strictly smaller than <paramref name="big"/> in both dimensions.
        /// </exception>
        public static Point ImageSearch(Bitmap big, Bitmap small)
        {
            int mx;
            float[] diffs = ComputeDiffs(big, small, out mx);
            int minIndex = FindMinCPU(diffs, diffs.Length);
            return new Point(minIndex % mx, minIndex / mx);
        }

        /// <summary>
        /// Finds every offset of <paramref name="small"/> inside
        /// <paramref name="big"/> whose difference score is below
        /// <paramref name="maxDeviation"/>.
        /// </summary>
        /// <returns>Matching top-left offsets, in row-major scan order.</returns>
        /// <exception cref="ArgumentNullException">An image is null.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="small"/> is not strictly smaller than <paramref name="big"/> in both dimensions.
        /// </exception>
        public static List<Point> ImageSearch(Bitmap big, Bitmap small, float maxDeviation)
        {
            int mx;
            float[] diffs = ComputeDiffs(big, small, out mx);

            List<Point> points = new List<Point>();
            for (int i = 0; i < diffs.Length; ++i)
            {
                if (diffs[i] < maxDeviation)
                {
                    points.Add(new Point(i % mx, i / mx));
                }
            }
            return points;
        }

        // Validates the inputs, converts both bitmaps and runs the native search.
        // Returns the row-major difference map; mx receives its row width
        // (big.Width - small.Width). Also updates LastKernelTime.
        private static float[] ComputeDiffs(Bitmap big, Bitmap small, out int mx)
        {
            if (big == null) throw new ArgumentNullException("big");
            if (small == null) throw new ArgumentNullException("small");

            int bw = big.Width;
            int bh = big.Height;
            int sw = small.Width;
            int sh = small.Height;
            mx = bw - sw;
            int my = bh - sh;

            // Without this check a zero-width map ends in a DivideByZeroException
            // and a negative size in an OverflowException from the array allocation.
            if (mx <= 0 || my <= 0)
            {
                throw new ArgumentException("The small image must be strictly smaller than the big image in both dimensions.", "small");
            }

            float[] diffs = new float[mx * my];
            float[] b = ImageToFloat(big);
            float[] s = ImageToFloat(small);
            _lastKernelTime = ImageSearchGPU(diffs, b, s, bw, bh, sw, sh);
            return diffs;
        }

#if SUPPORT_HSB

        // Converts a bitmap to 4 floats per pixel: hue scaled to 0..1,
        // saturation, brightness and alpha scaled to 0..1.
        private static float[] ImageToFloat(Bitmap img)
        {
            int w = img.Width;
            int h = img.Height;
            float[] pix = new float[w * h * 4];
            int i = 0;
            for (int y = 0; y < h; ++y)
            {
                for (int x = 0; x < w; ++x)
                {
                    Color c = img.GetPixel(x, y);
                    pix[i] = c.GetHue() / 360;
                    pix[i + 1] = c.GetSaturation();
                    pix[i + 2] = c.GetBrightness();
                    // Color.A is a byte (0..255); scale it like the other
                    // conversion path does so alpha weights stay in 0..1.
                    pix[i + 3] = c.A / 255.0f;
                    i += 4;
                }
            }
            return pix;
        }
#else
        // Converts a bitmap to 4 floats per pixel in 0..1, taken from the four
        // bytes of each 32-bit pixel value, lowest byte first.
        private static float[] ImageToFloat(Bitmap bmp)
        {
            int w = bmp.Width;
            int h = bmp.Height;
            int n = w * h;
            int[] pixels = new int[n];

            Rectangle r = new Rectangle(0, 0, w, h);
            // Request 32bpp ARGB explicitly so bitmaps stored in another format
            // are converted on lock instead of being misread as 4 bytes per pixel.
            BitmapData bmpData = bmp.LockBits(r, ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
            try
            {
                // Copy row by row through Stride so that row padding or a
                // negative (bottom-up) stride is handled correctly.
                long scan0 = bmpData.Scan0.ToInt64();
                for (int y = 0; y < h; ++y)
                {
                    Marshal.Copy(new IntPtr(scan0 + (long)y * bmpData.Stride), pixels, y * w, w);
                }
            }
            finally
            {
                bmp.UnlockBits(bmpData);
            }

            float[] pix = new float[n * 4];
            int j = 0;
            for (int i = 0; i < n; ++i)
            {
                pix[j] = (pixels[i] & 255) / 255.0f;
                pix[j + 1] = ((pixels[i] >> 8) & 255) / 255.0f;
                pix[j + 2] = ((pixels[i] >> 16) & 255) / 255.0f;
                pix[j + 3] = ((pixels[i] >> 24) & 255) / 255.0f;
                j += 4;
            }
            return pix;
        }
#endif
    }
}

原文

In my current project I need to find the pixel-exact position of an image contained in another, larger image. The smaller image is never rotated or stretched (so it should match pixel by pixel), but it may have different brightness and some pixels in the image may be distorted. My first attempt was to do it on the CPU, but it was too slow. The calculations are highly parallel, so I decided to use the GPU. I just started to learn CUDA and wrote my first CUDA app. My code works, but it is still too slow even on the GPU. When the larger image has a dimension of 1024x1280 and the smaller one is 128x128, the program performs the calculations in 2000ms on a GeForce GTX 560 Ti. I need to get results in less than 200ms. In the future I'll probably need a more complex algorithm, so I'd rather have even more computational power in reserve. The question is: how can I optimise my code to achieve that speed-up?

CUDAImageLib.dll:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <cutil.h>

//#define SUPPORT_ALPHA

// Sum-of-absolute-differences template search kernel.
//
// Each thread owns one candidate offset (bx, by) of the small image inside the
// big image and accumulates the absolute difference of the first three
// channels over every pixel of the small image. Both images are laid out as
// 4 consecutive floats per pixel (hence the "* 4" in the indexing). The fourth
// channel is only read when SUPPORT_ALPHA is defined, where both alpha values
// weight the per-pixel difference.
//
// Launch layout: 2D grid of 2D blocks whose threads cover at least
// (bw - sw) x (bh - sh) offsets; surplus threads exit immediately.
// Shared memory: none.
// Preconditions: bw > sw and bh > sh (the subtraction is unsigned).
//
// BufferOut receives (bw - sw) * (bh - sh) floats, row-major, one score per
// offset: BufferOut[bx + by * (bw - sw)].
__global__ void ImageSearch_kernel(float* __restrict__ BufferOut, const float* __restrict__ BufferB, const float* __restrict__ BufferS, unsigned int bw, unsigned int bh, unsigned int sw, unsigned int sh)
{
    const unsigned int outW = bw - sw;
    const unsigned int outH = bh - sh;
    const unsigned int bx = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int by = threadIdx.y + blockIdx.y * blockDim.y;

    // The grid is rounded up to whole blocks, so threads past the edge of the
    // result map must not read or write anything.
    if (bx >= outW || by >= outH)
        return;

    float diff = 0.0f;
    for (unsigned int y = 0; y < sh; ++y)
    {
        // Row bases are loop-invariant for the inner loop; size_t keeps the
        // element offsets from wrapping on large images.
        const float* rowS = BufferS + (size_t)y * sw * 4;
        const float* rowB = BufferB + ((size_t)(y + by) * bw + bx) * 4;
        for (unsigned int x = 0; x < sw; ++x)
        {
            const float* ps = rowS + (size_t)x * 4;
            const float* pb = rowB + (size_t)x * 4;
#ifdef SUPPORT_ALPHA
            diff += ((fabsf(ps[0] - pb[0]) + fabsf(ps[1] - pb[1]) + fabsf(ps[2] - pb[2])) * ps[3] * pb[3]);
#else
            diff += fabsf(ps[0] - pb[0]);
            diff += fabsf(ps[1] - pb[1]);
            diff += fabsf(ps[2] - pb[2]);
#endif
        }
    }
    BufferOut[bx + (size_t)by * outW] = diff;
}

// Host entry point exported from the DLL: uploads both images, runs
// ImageSearch_kernel for every offset and downloads the score map.
//
// Parameters:
//   BufferOut - host array receiving (bw - sw) * (bh - sh) floats.
//   BufferB   - host array with the big image, bw * bh * 4 floats.
//   BufferS   - host array with the small image, sw * sh * 4 floats.
//   bw, bh    - big image width / height in pixels.
//   sw, sh    - small image width / height in pixels; must be strictly
//               smaller than bw / bh.
//
// Returns the time in milliseconds (truncated to int) measured by the cutil
// timer around the kernel launch and the following device synchronisation,
// or -1 if the arguments are invalid or any CUDA call fails. On failure the
// contents of BufferOut are unspecified.
extern "C" int __declspec(dllexport) __stdcall ImageSearchGPU(float* BufferOut, float* BufferB, float* BufferS, int bw, int bh, int sw, int sh)
{
    if (BufferOut == 0 || BufferB == 0 || BufferS == 0 || sw <= 0 || sh <= 0 || bw <= sw || bh <= sh) {
        fprintf(stderr, "ImageSearchGPU: invalid arguments (big %dx%d, small %dx%d)\n", bw, bh, sw, sh);
        return -1;
    }

    // NOTE: every local is declared before the first "goto Error" so that no
    // jump crosses an initialisation.
    const unsigned int outW = (unsigned int)(bw - sw);
    const unsigned int outH = (unsigned int)(bh - sh);

    // Byte counts are computed in size_t; the int products overflow for large images.
    const size_t aBytes = (size_t)bw * (size_t)bh * 4 * sizeof(float);
    const size_t bBytes = (size_t)sw * (size_t)sh * 4 * sizeof(float);
    const size_t cBytes = (size_t)outW * (size_t)outH * sizeof(float);

    // Round the grid up so the right/bottom tail of the result map is computed
    // too; the kernel discards the surplus threads.
    dim3 threadsPerBlock(32, 32);
    dim3 numBlocks((outW + threadsPerBlock.x - 1) / threadsPerBlock.x, (outH + threadsPerBlock.y - 1) / threadsPerBlock.y);

    float *dev_B = 0;
    float *dev_S = 0;
    float *dev_Out = 0;

    unsigned int timer = 0;
    bool timerCreated = false;
    float sExecutionTime = 0;
    int result = -1;

    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the score map and the two input images.
    cudaStatus = cudaMalloc((void**)&dev_Out, cBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_B, aBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_S, bBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy both input images from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_B, BufferB, aBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_S, BufferS, bBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cutCreateTimer(&timer);
    timerCreated = true;
    cutStartTimer(timer);

    // Launch one thread per candidate offset.
    ImageSearch_kernel<<<numBlocks, threadsPerBlock>>>(dev_Out, dev_B, dev_S, bw, bh, sw, sh);

    // A bad launch configuration is only reported through cudaGetLastError.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "ImageSearch_kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered while it was running.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching ImageSearch_kernel!\n", cudaStatus);
        goto Error;
    }

    cutStopTimer(timer);
    sExecutionTime = cutGetTimerValue(timer);

    // Copy the score map from the GPU buffer to host memory.
    cudaStatus = cudaMemcpy(BufferOut, dev_Out, cBytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    result = (int)sExecutionTime;

Error:
    // Shared cleanup for the success and failure paths; cudaFree(0) is a no-op.
    if (timerCreated) {
        cutDeleteTimer(timer);
    }
    cudaFree(dev_Out);
    cudaFree(dev_B);
    cudaFree(dev_S);
    return result;
}

// Linear scan on the host for the position of the smallest value.
//
//   values - array holding at least `count` floats.
//   count  - number of elements to examine.
//
// Returns the index of the first element that is strictly smaller than every
// element before it and below the 3.4e+38F starting threshold; returns 0 when
// no element qualifies (including count <= 0).
extern "C" int __declspec(dllexport) __stdcall FindMinCPU(float* values, int count)
{
    int bestIndex = 0;
    float bestValue = 3.4e+38F;

    int pos = 0;
    while (pos < count)
    {
        const float candidate = values[pos];
        if (candidate < bestValue)
        {
            bestIndex = pos;
            bestValue = candidate;
        }
        ++pos;
    }

    return bestIndex;
}

C# test app:

using System;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
using System.Drawing;

namespace TestCUDAImageSearch
{
    /// <summary>
    /// Console driver: loads "Big.png" and "Small.png", runs the image search
    /// once and prints the reported position together with the timings.
    /// </summary>
    class Program
    {
        /// <summary>Program entry point; <paramref name="args"/> is not used.</summary>
        static void Main(string[] args)
        {
            using (Bitmap big = new Bitmap("Big.png"))
            using (Bitmap small = new Bitmap("Small.png"))
            {
                Console.WriteLine("Big " + big.Width + "x" + big.Height + "    Small " + small.Width + "x" + small.Height);

                // Wall-clock time of the whole search call, as opposed to the
                // kernel time reported by the library.
                Stopwatch watch = Stopwatch.StartNew();
                Point location = CUDAImageLIb.ImageSearch(big, small);
                watch.Stop();

                long totalMs = watch.ElapsedMilliseconds;
                Console.WriteLine("Image found at " + location.X + "x" + location.Y);
                Console.WriteLine("total time=" + totalMs + "ms     kernel time=" + CUDAImageLIb.LastKernelTime + "ms");
            }

            // Keep the console window open until a key is pressed.
            Console.WriteLine("Hit key");
            Console.ReadKey();
        }
    }
}



//#define SUPPORT_HSB

using System;
using System.Collections.Generic;
using System.Text;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;

namespace TestCUDAImageSearch
{
    /// <summary>
    /// Managed wrapper around CUDAImageLib.dll: converts bitmaps to float
    /// arrays (4 floats per pixel), asks the native library for a per-offset
    /// difference map and interprets that map.
    /// </summary>
    public static class CUDAImageLIb
    {
        [DllImport("CUDAImageLib.dll")]
        private static extern int ImageSearchGPU(float[] bufferOut, float[] bufferB, float[] bufferS, int bw, int bh, int sw, int sh);

        [DllImport("CUDAImageLib.dll")]
        private static extern int FindMinCPU(float[] values, int count);

        private static int _lastKernelTime = 0;

        /// <summary>Value returned by the most recent ImageSearchGPU call.</summary>
        public static int LastKernelTime
        {
            get { return _lastKernelTime; }
        }

        /// <summary>
        /// Finds the offset of <paramref name="small"/> inside
        /// <paramref name="big"/> with the lowest difference score.
        /// </summary>
        /// <returns>Top-left offset of the best match, in pixels of <paramref name="big"/>.</returns>
        /// <exception cref="ArgumentNullException">An image is null.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="small"/> is not strictly smaller than <paramref name="big"/> in both dimensions.
        /// </exception>
        public static Point ImageSearch(Bitmap big, Bitmap small)
        {
            int mx;
            float[] diffs = ComputeDiffs(big, small, out mx);
            int minIndex = FindMinCPU(diffs, diffs.Length);
            return new Point(minIndex % mx, minIndex / mx);
        }

        /// <summary>
        /// Finds every offset of <paramref name="small"/> inside
        /// <paramref name="big"/> whose difference score is below
        /// <paramref name="maxDeviation"/>.
        /// </summary>
        /// <returns>Matching top-left offsets, in row-major scan order.</returns>
        /// <exception cref="ArgumentNullException">An image is null.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="small"/> is not strictly smaller than <paramref name="big"/> in both dimensions.
        /// </exception>
        public static List<Point> ImageSearch(Bitmap big, Bitmap small, float maxDeviation)
        {
            int mx;
            float[] diffs = ComputeDiffs(big, small, out mx);

            List<Point> points = new List<Point>();
            for (int i = 0; i < diffs.Length; ++i)
            {
                if (diffs[i] < maxDeviation)
                {
                    points.Add(new Point(i % mx, i / mx));
                }
            }
            return points;
        }

        // Validates the inputs, converts both bitmaps and runs the native search.
        // Returns the row-major difference map; mx receives its row width
        // (big.Width - small.Width). Also updates LastKernelTime.
        private static float[] ComputeDiffs(Bitmap big, Bitmap small, out int mx)
        {
            if (big == null) throw new ArgumentNullException("big");
            if (small == null) throw new ArgumentNullException("small");

            int bw = big.Width;
            int bh = big.Height;
            int sw = small.Width;
            int sh = small.Height;
            mx = bw - sw;
            int my = bh - sh;

            // Without this check a zero-width map ends in a DivideByZeroException
            // and a negative size in an OverflowException from the array allocation.
            if (mx <= 0 || my <= 0)
            {
                throw new ArgumentException("The small image must be strictly smaller than the big image in both dimensions.", "small");
            }

            float[] diffs = new float[mx * my];
            float[] b = ImageToFloat(big);
            float[] s = ImageToFloat(small);
            _lastKernelTime = ImageSearchGPU(diffs, b, s, bw, bh, sw, sh);
            return diffs;
        }

#if SUPPORT_HSB

        // Converts a bitmap to 4 floats per pixel: hue scaled to 0..1,
        // saturation, brightness and alpha scaled to 0..1.
        private static float[] ImageToFloat(Bitmap img)
        {
            int w = img.Width;
            int h = img.Height;
            float[] pix = new float[w * h * 4];
            int i = 0;
            for (int y = 0; y < h; ++y)
            {
                for (int x = 0; x < w; ++x)
                {
                    Color c = img.GetPixel(x, y);
                    pix[i] = c.GetHue() / 360;
                    pix[i + 1] = c.GetSaturation();
                    pix[i + 2] = c.GetBrightness();
                    // Color.A is a byte (0..255); scale it like the other
                    // conversion path does so alpha weights stay in 0..1.
                    pix[i + 3] = c.A / 255.0f;
                    i += 4;
                }
            }
            return pix;
        }
#else
        // Converts a bitmap to 4 floats per pixel in 0..1, taken from the four
        // bytes of each 32-bit pixel value, lowest byte first.
        private static float[] ImageToFloat(Bitmap bmp)
        {
            int w = bmp.Width;
            int h = bmp.Height;
            int n = w * h;
            int[] pixels = new int[n];

            Rectangle r = new Rectangle(0, 0, w, h);
            // Request 32bpp ARGB explicitly so bitmaps stored in another format
            // are converted on lock instead of being misread as 4 bytes per pixel.
            BitmapData bmpData = bmp.LockBits(r, ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
            try
            {
                // Copy row by row through Stride so that row padding or a
                // negative (bottom-up) stride is handled correctly.
                long scan0 = bmpData.Scan0.ToInt64();
                for (int y = 0; y < h; ++y)
                {
                    Marshal.Copy(new IntPtr(scan0 + (long)y * bmpData.Stride), pixels, y * w, w);
                }
            }
            finally
            {
                bmp.UnlockBits(bmpData);
            }

            float[] pix = new float[n * 4];
            int j = 0;
            for (int i = 0; i < n; ++i)
            {
                pix[j] = (pixels[i] & 255) / 255.0f;
                pix[j + 1] = ((pixels[i] >> 8) & 255) / 255.0f;
                pix[j + 2] = ((pixels[i] >> 16) & 255) / 255.0f;
                pix[j + 3] = ((pixels[i] >> 24) & 255) / 255.0f;
                j += 4;
            }
            return pix;
        }
#endif
    }
}

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

懒的傷心 2024-12-11 08:12:05

看起来您正在谈论的是一个众所周知的问题：模板匹配。最简单的方法是将图像（较大图像）与模板（较小图像）进行卷积。您可以通过两种方式之一实现卷积。

1) 修改 CUDA SDK 中的卷积示例（与您所做的类似）。

2) 使用 FFT 来实现卷积。参见卷积定理。您需要记住：

% MATLAB format
L = size(A) + size(B) - 1;
conv2(A, B) = IFFT2(FFT2(A, L) .* FFT2(B, L));

您可以使用 cufft 来实现 2 维 FFT（在适当填充它们之后）。您需要编写一个执行元素乘法的内核，然后在执行逆 FFT 之前对结果进行归一化（因为 CUFFT 不会归一化）。

对于您提到的尺寸（1024 x 1280 和 128 x 128），输入必须至少填充到（（1024 + 128 - 1）x（1280 + 128 -1）= 1151 x 1407）。但当（填充的）输入为 2 的幂时，FFT 速度最快。因此，您需要将大图像和小图像填充到大小 2048 x 2048。

Looks like what you are talking about is a well known problem: Template matching. The easiest way forward is to convolve the Image (the bigger image) with the template (the smaller image). You could implement convolutions in one of two ways.

1) Modify the convolutions example from the CUDA SDK (similar to what you are doing anyway).

2) Use FFTs to implement the convolution. Ref. Convolution theorem. You will need to remember

% MATLAB format
L = size(A) + size(B) - 1;
conv2(A, B) = IFFT2(FFT2(A, L) .* FFT2(B, L));

You could use cufft to implement the 2 dimensional FFTs (After padding them appropriately). You will need to write a kernel that does element wise multiplication and then normalizes the result (because CUFFT does not normalize) before performing the inverse FFT.

For the sizes you mention (1024 x 1280 and 128 x 128), the inputs must be padded to at least ((1024 + 128 - 1) x (1280 + 128 - 1) = 1151 x 1407). But FFTs are fastest when the (padded) inputs are powers of 2. So you will need to pad both the large and small images to size 2048 x 2048.

回复收藏 0 原文