
Vector addition - the ‘Hello, world’ of CUDA


Version 1 of the kernel

This version does everything explicitly and is essentially what you would have to write in CUDA C.

from numba import cuda, vectorize
import numpy as np

@cuda.jit('void(float32[:], float32[:], float32[:])')
def cu_add1(a, b, c):
    """This kernel function will be executed by a thread."""
    bx = cuda.blockIdx.x  # which block in the grid?
    bw = cuda.blockDim.x  # what is the size of a block?
    tx = cuda.threadIdx.x # unique thread ID within a block
    i = tx + bx * bw

    if i >= c.size:  # guard: extra threads in the last block must not write
        return

    c[i] = a[i] + b[i]
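
For example, with 32 threads per block, thread tx = 5 in block bx = 2 computes the global index i = 5 + 2 * 32 = 69.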

Launching the kernel

device = cuda.get_current_device()

n = 100

# Host memory
a = np.arange(n, dtype=np.float32)
b = np.arange(n, dtype=np.float32)

# Assign equivalent storage on device
da = cuda.to_device(a)
db = cuda.to_device(b)

# Assign storage on device for output
dc = cuda.device_array_like(a)

# Set up enough threads for kernel
tpb = device.WARP_SIZE
bpg = int(np.ceil(float(n)/tpb))
print('Blocks per grid:', bpg)
print('Threads per block', tpb)

# Launch kernel
cu_add1[bpg, tpb](da, db, dc)

# Transfer output from device to host
c = dc.copy_to_host()

print(c)
Blocks per grid: 4
Threads per block 32
[   0.    2.    4.    6.    8.   10.   12.   14.   16.   18.   20.   22.
   24.   26.   28.   30.   32.   34.   36.   38.   40.   42.   44.   46.
   48.   50.   52.   54.   56.   58.   60.   62.   64.   66.   68.   70.
   72.   74.   76.   78.   80.   82.   84.   86.   88.   90.   92.   94.
   96.   98.  100.  102.  104.  106.  108.  110.  112.  114.  116.  118.
  120.  122.  124.  126.  128.  130.  132.  134.  136.  138.  140.  142.
  144.  146.  148.  150.  152.  154.  156.  158.  160.  162.  164.  166.
  168.  170.  172.  174.  176.  178.  180.  182.  184.  186.  188.  190.
  192.  194.  196.  198.]
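
As a quick sanity check (not part of the original tutorial), the device result can be compared against the same addition performed on the host with plain NumPy:

# A minimal sketch: verify the GPU output against a host-side computation.
expected = a + b                          # ordinary NumPy addition on the CPU
np.testing.assert_allclose(c, expected)   # raises an AssertionError on mismatch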

Version 2 of the kernel

This version uses the cuda.grid() convenience function and Numba's automatic transfer of host arrays to eliminate much of the boilerplate in version 1.

@cuda.jit('void(float32[:], float32[:], float32[:])')
def cu_add2(a, b, c):
    """This kernel function will be executed by a thread."""
    i = cuda.grid(1)  # global thread index, equivalent to tx + bx * bw

    if i >= c.shape[0]:  # guard against out-of-bounds access
        return

    c[i] = a[i] + b[i]

Launching the kernel

device = cuda.get_current_device()

n = 100
a = np.arange(n, dtype=np.float32)
b = np.arange(n, dtype=np.float32)
c = np.empty_like(a)

tpb = device.WARP_SIZE
bpg = int(np.ceil(float(n)/tpb))
print('Blocks per grid:', bpg)
print('Threads per block', tpb)

cu_add2[bpg, tpb](a, b, c)
print(c)
Blocks per grid: 4
Threads per block 32
[   0.    2.    4.    6.    8.   10.   12.   14.   16.   18.   20.   22.
   24.   26.   28.   30.   32.   34.   36.   38.   40.   42.   44.   46.
   48.   50.   52.   54.   56.   58.   60.   62.   64.   66.   68.   70.
   72.   74.   76.   78.   80.   82.   84.   86.   88.   90.   92.   94.
   96.   98.  100.  102.  104.  106.  108.  110.  112.  114.  116.  118.
  120.  122.  124.  126.  128.  130.  132.  134.  136.  138.  140.  142.
  144.  146.  148.  150.  152.  154.  156.  158.  160.  162.  164.  166.
  168.  170.  172.  174.  176.  178.  180.  182.  184.  186.  188.  190.
  192.  194.  196.  198.]
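
Here cuda.grid(1) is just shorthand for the index arithmetic spelled out in version 1. A small sketch (not from the original) demonstrates the equivalence by storing the explicit index at the shorthand index:

@cuda.jit
def check_grid(out):
    i_explicit = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    i_shorthand = cuda.grid(1)
    if i_shorthand < out.size:
        out[i_shorthand] = i_explicit  # the two indices should agree

out = np.zeros(64, dtype=np.int64)
check_grid[2, 32](out)
print(out)  # 0..63 in order if the explicit and shorthand forms match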

Vector addition with the vectorize decorator

@vectorize(['int64(int64, int64)',
            'float32(float32, float32)',
            'float64(float64, float64)'],
           target='gpu')  # note: recent Numba versions spell this target 'cuda'
def cu_add(a, b):
    return a + b

n = 100
a = np.arange(n, dtype=np.float32)
b = np.arange(n, dtype=np.float32)
c = cu_add(a, b)
print(c)
[   0.    2.    4.    6.    8.   10.   12.   14.   16.   18.   20.   22.
   24.   26.   28.   30.   32.   34.   36.   38.   40.   42.   44.   46.
   48.   50.   52.   54.   56.   58.   60.   62.   64.   66.   68.   70.
   72.   74.   76.   78.   80.   82.   84.   86.   88.   90.   92.   94.
   96.   98.  100.  102.  104.  106.  108.  110.  112.  114.  116.  118.
  120.  122.  124.  126.  128.  130.  132.  134.  136.  138.  140.  142.
  144.  146.  148.  150.  152.  154.  156.  158.  160.  162.  164.  166.
  168.  170.  172.  174.  176.  178.  180.  182.  184.  186.  188.  190.
  192.  194.  196.  198.]
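
Because three signatures were registered, cu_add behaves like a NumPy ufunc and dispatches on the input dtype. A brief illustration (not part of the original):

ai = np.arange(5, dtype=np.int64)
bi = np.arange(5, dtype=np.int64)
print(cu_add(ai, bi).dtype)  # int64   -- the int64 signature is selected
print(cu_add(a, b).dtype)    # float32 -- matches the float32 inputs above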

2D version

@cuda.jit('void(float32[:,:], float32[:,:], float32[:,:])')
def cu_add_2d(a, b, c):
    """This kernel function will be executed by a thread."""
    i, j = cuda.grid(2)  # global thread indices in x and y

    if (i < c.shape[0]) and (j < c.shape[1]):
        c[i, j] = a[i, j] + b[i, j]
    cuda.syncthreads()  # not strictly needed here: each thread writes an independent element

The low-level cuda.jit approach requires the kernel to be launched with correctly chosen blockspergrid and threadsperblock values.

device = cuda.get_current_device()

n = 480
p = 320
a = np.random.random((n, p)).astype(np.float32)
b = np.ones((n, p)).astype(np.float32)
c = np.empty_like(a)

threadsperblock = (16, 16)
# ceiling division: enough blocks to cover every element in each dimension
blockspergrid_x = (n + threadsperblock[0] - 1) // threadsperblock[0]
blockspergrid_y = (p + threadsperblock[1] - 1) // threadsperblock[1]
blockspergrid = (blockspergrid_x, blockspergrid_y)

print(blockspergrid, threadsperblock)

cu_add_2d[blockspergrid, threadsperblock](a, b, c)
print(a[-5:, -5:])
print(b[-5:, -5:])
print(c[-5:, -5:])
(30, 20) (16, 16)
[[ 0.5805  0.1855  0.956   0.6484  0.6058]
 [ 0.1826  0.969   0.0568  0.0099  0.8153]
 [ 0.976   0.5761  0.7721  0.8327  0.1189]
 [ 0.3401  0.6968  0.7493  0.8439  0.3382]
 [ 0.0203  0.541   0.5694  0.4623  0.5394]]
[[ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]]
[[ 1.5805  1.1855  1.956   1.6484  1.6058]
 [ 1.1826  1.969   1.0568  1.0099  1.8153]
 [ 1.976   1.5761  1.7721  1.8327  1.1189]
 [ 1.3401  1.6968  1.7493  1.8439  1.3382]
 [ 1.0203  1.541   1.5694  1.4623  1.5394]]
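
The blocks-per-grid arithmetic is a common enough pattern that it is worth factoring into a small helper. A sketch (not part of the original):

def grid_size(shape, threadsperblock):
    """Ceiling division per dimension: enough blocks to cover the array."""
    return tuple((n + tpb - 1) // tpb for n, tpb in zip(shape, threadsperblock))

print(grid_size((480, 320), (16, 16)))  # (30, 20)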

Using vectorize

Note that this is exactly the same function as the 1D version! The vectorize decorator takes care of the blocks-per-grid and threads-per-block calculations for you.

@vectorize(['int64(int64, int64)',
            'float32(float32, float32)',
            'float64(float64, float64)'],
           target='gpu')
def cu_vec_add_2d(a, b):
    return a + b

n = 480
p = 320
a = np.random.random((n, p)).astype(np.float32)
b = np.ones((n, p)).astype(np.float32)

c = cu_vec_add_2d(a, b)

print(a[-5:, -5:])
print(b[-5:, -5:])
print(c[-5:, -5:])
[[ 0.0103  0.1075  0.248   0.9841  0.6077]
 [ 0.2986  0.8319  0.9616  0.037   0.4071]
 [ 0.3979  0.1994  0.6463  0.035   0.0368]
 [ 0.3706  0.879   0.7187  0.5635  0.4726]
 [ 0.4652  0.2049  0.6163  0.0255  0.8036]]
[[ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]]
[[ 1.0103  1.1075  1.248   1.9841  1.6077]
 [ 1.2986  1.8319  1.9616  1.037   1.4071]
 [ 1.3979  1.1994  1.6463  1.035   1.0368]
 [ 1.3706  1.879   1.7187  1.5635  1.4726]
 [ 1.4652  1.2049  1.6163  1.0255  1.8036]]

Switching execution target

One advantage of the high-level vectorize decorator is that the function code will run without any change on a single core, multiple cores, or a GPU simply by changing the target. This can be used to run the appropriate code depending on problem type and size, or as a fallback on machines that lack a GPU (see the sketch after this example).

# run in parallel on multiple CPU cores by changing the target
@vectorize(['int64(int64, int64)',
            'float64(float32, float32)',
            'float64(float64, float64)'],
           target='parallel')
def mc_add(a, b):
    return a + b

mc_add(a, b)
array([[ 1.5631,  1.3817,  1.2615, ...,  1.3443,  1.8109,  1.4728],
       [ 1.1671,  1.0367,  1.7714, ...,  1.0079,  1.5834,  1.6367],
       [ 1.2247,  1.0565,  1.221 , ...,  1.2337,  1.884 ,  1.4036],
       ...,
       [ 1.5096,  1.8178,  1.1805, ...,  1.6463,  1.035 ,  1.0368],
       [ 1.6514,  1.8149,  1.7942, ...,  1.7187,  1.5635,  1.4726],
       [ 1.8826,  1.9288,  1.6108, ...,  1.6163,  1.0255,  1.8036]])
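
One way to exploit this (a sketch, not from the original) is to pick the target at import time, falling back to the CPU when no GPU is present. It uses numba.cuda.is_available(), which reports whether a usable CUDA device was found:

from numba import vectorize, cuda

# Choose the execution target once, when the module is imported.
target = 'gpu' if cuda.is_available() else 'cpu'  # recent Numba spells the GPU target 'cuda'

@vectorize(['float64(float64, float64)'], target=target)
def flexible_add(a, b):
    return a + b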
