
More examples


  • Grids, blocks and threads
  • The maximum size of a block is 512 or 1024 threads, depending on the GPU
  • Work around this by using many blocks of threads to partition the matrix computation (see the index-mapping sketch after the figure below)
  • The full matrix is divided into tiles
  • See figure below
Image(url="http://docs.nvidia.com/cuda/cuda-c-programming-guide/graphics/matrix-multiplication-with-shared-memory.png")
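As an illustration (not part of the original notebook), the small sketch below shows what `cuda.grid(2)` computes for each thread: its global (x, y) position derived from the block and thread indices, which is how many small blocks jointly cover one large matrix. The kernel name and sizes are arbitrary.

from numba import cuda
import numpy as np

@cuda.jit
def where_am_i(out):
    # cuda.grid(2) is shorthand for the usual index arithmetic:
    #   x = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    #   y = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    x, y = cuda.grid(2)
    if x < out.shape[0] and y < out.shape[1]:
        out[x, y] = x * out.shape[1] + y

out = np.zeros((8, 8), dtype=np.int32)
where_am_i[(2, 2), (4, 4)](out)  # a 2x2 grid of 4x4 blocks covers the 8x8 array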

import numpy as np
from numba import cuda, float32  # imports assumed; not shown in this excerpt

x1 = np.random.random((4,4))
x2 = np.random.random((4,4))
np.dot(x1, x2).shape
(4, 4)

Kernel function (no shared memory)

@cuda.jit('void(float32[:,:], float32[:,:], float32[:,:], int32)')
def cu_matmul(a, b, c, n):
    x, y = cuda.grid(2)

    if (x >= n) or (y >= n):
        return

    c[x, y] = 0
    for i in range(n):
        c[x, y] +=  a[x, i] * b[i, y]

device = cuda.get_current_device()
tpb = device.WARP_SIZE
n = 400
bpg = (n+tpb-1)//tpb
grid_dim = (bpg, bpg)
block_dim = (tpb, tpb)

A = np.random.random((n, n)).astype(np.float32)
B = np.random.random((n, n)).astype(np.float32)
C = np.empty((n, n), dtype=np.float32)
cu_matmul[grid_dim, block_dim](A, B, C, n)
assert(np.allclose(np.dot(A, B), C))

Matrix multiply with shared memory

Memory access speed (fastest to slowest):

  • Local to thread
  • Shared among a block of threads
  • Global (much slower than shared)
  • Host

We want to push memory access as close to the threads as possible. In practice, the challenge is usually to structure the program so that shared memory use is optimized.

Image(url="http://docs.nvidia.com/cuda/cuda-c-programming-guide/graphics/memory-hierarchy.png")
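Before the tiled matrix-multiply kernel, here is a minimal sketch of the shared-memory pattern (not from the original notebook; the kernel name and block size of 32 are illustrative): each block stages its slice of the input in fast shared memory, synchronizes, and then each thread reads an element that a different thread wrote.

from numba import cuda, float32
import numpy as np

@cuda.jit
def block_reverse(x, out):
    # stage this block's slice of x in fast shared memory
    tile = cuda.shared.array(shape=32, dtype=float32)
    i = cuda.grid(1)
    tx = cuda.threadIdx.x
    if i < x.shape[0]:
        tile[tx] = x[i]
    # wait until every thread in the block has written its element
    cuda.syncthreads()
    # each thread now reads an element written by a *different* thread
    if i < x.shape[0]:
        out[i] = tile[cuda.blockDim.x - 1 - tx]

x = np.arange(64, dtype=np.float32)
out = np.zeros_like(x)
block_reverse[2, 32](x, out)  # each 32-element block of x appears reversed in out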

Use shared memory with tiling to exploit locality

Image(url="http://docs.nvidia.com/cuda/cuda-c-programming-guide/graphics/matrix-multiplication-with-shared-memory.png")

Kernel function (with shared memory)

tpb = device.WARP_SIZE
block_dim = (tpb, tpb)

@cuda.jit('void(float32[:,:], float32[:,:], float32[:,:], int32, int32, int32)')
def cu_matmul_sm(A, B, C, n, tpb, bpg):
    # declare shared memory
    sA = cuda.shared.array(shape=block_dim, dtype=float32)
    sB = cuda.shared.array(shape=block_dim, dtype=float32)

    # we now need the thread ID within a block as well as the global thread ID
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    x, y = cuda.grid(2)

    # perform partial operations in block-sized tiles,
    # saving intermediate values in an accumulator variable
    acc = 0.0
    for i in range(bpg):
        # Stage 1: Prefill shared memory with the current tile from matrix A and matrix B
        sA[tx, ty] = A[x, ty + i * tpb]
        sB[tx, ty] = B[tx + i * tpb, y]

        # Block until all threads have finished filling shared memory
        cuda.syncthreads()

        # Stage 2: Compute partial dot product and add to accumulator
        if x < n and y < n:
            for j in range(tpb):
                acc += sA[tx, j] * sB[j, ty]

        # Block until all threads have completed the calculation before the next loop iteration
        cuda.syncthreads()

    # Put accumulated dot product into output matrix
    if x < n and y < n:
        C[x, y] = acc

k = 32
n = tpb * k # n must be a multiple of tpb because shared memory is not zero-initialized (see the padding sketch after this cell)
bpg = n//tpb
grid_dim = (bpg, bpg)

A = np.random.random((n, n)).astype(np.float32)
B = np.random.random((n, n)).astype(np.float32)
C = np.empty((n, n), dtype=np.float32)
cu_matmul_sm[grid_dim, block_dim](A, B, C, n, tpb, bpg)
assert(np.allclose(np.dot(A, B), C))
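
The cell above requires n to be a multiple of tpb. One way around this, sketched here as an assumption rather than original code (`pad_to_multiple` is a hypothetical helper), is to zero-pad the inputs up to the next multiple of tpb and slice the result:

import numpy as np

def pad_to_multiple(x, tpb):
    """Zero-pad a square float32 matrix so its side length is a multiple of tpb."""
    n = x.shape[0]
    m = ((n + tpb - 1) // tpb) * tpb
    padded = np.zeros((m, m), dtype=np.float32)
    padded[:n, :n] = x
    return padded

# e.g. for an n x n problem where n is not a multiple of tpb:
# Ap, Bp = pad_to_multiple(A, tpb), pad_to_multiple(B, tpb)
# Cp = np.empty_like(Ap)
# bpg = Ap.shape[0] // tpb
# cu_matmul_sm[(bpg, bpg), (tpb, tpb)](Ap, Bp, Cp, Ap.shape[0], tpb, bpg)
# C = Cp[:n, :n]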

Benchmark
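
The benchmark below uses a `timer` function and a cuBLAS handle named `blas` that are not defined in this excerpt. A plausible setup, shown as an assumption (using the NumbaPro cuBLAS binding of that era), would be:

from timeit import default_timer as timer   # assumed timer used below
from numbapro.cudalib import cublas          # assumed: NumbaPro's cuBLAS binding

blas = cublas.Blas()                         # handle whose gemm is called below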

k = 8
n = tpb * k
bpg = n//tpb
grid_dim = (bpg, bpg)

# Prepare data on the CPU
A = np.array(np.random.random((n, n)), dtype=np.float32)
B = np.array(np.random.random((n, n)), dtype=np.float32)
C = np.zeros_like(A)

print "N = %d x %d" % (n, n)

# Prepare data on the GPU
dA = cuda.to_device(A)
dB = cuda.to_device(B)
dC = cuda.to_device(C) # device_array_like(A)

# Time numpy version
s = timer()
np_ans = np.dot(A, B)
e = timer()
t = e - s

# Time the unoptimized version
s = timer()
cu_matmul[grid_dim, block_dim](dA, dB, dC, n)
cuda.synchronize()
e = timer()
unopt_ans = dC.copy_to_host()
tcuda_unopt = e - s

# Time the shared memory version
s = timer()
cu_matmul_sm[grid_dim, block_dim](dA, dB, dC, n, tpb, bpg)
cuda.synchronize()
e = timer()
opt_ans = dC.copy_to_host()
tcuda_opt = e - s

# Time for CuBLAS version
s = timer()
blas.gemm('T', 'T', n, n, n, 1.0, A, B, 1.0, C) # A and B are not in Fortran order, so transpose them
e = timer()
blas_ans = dC.copy_to_host()
tcuda_blas = e - s

print "Using numpy.dot:", "%.2f" % t, "s"
print "Without shared memory:", "%.2f" % tcuda_unopt, "s"
print "With shared memory:", "%.2f" % tcuda_opt, "s"
print "Using CuBLAS:", "%.2f" % tcuda_blas, "s"
N = 256 x 256
Using numpy.dot: 0.00 s
Without shared memory: 0.01 s
With shared memory: 0.00 s
Using CuBLAS: 0.00 s
assert np.allclose(np_ans, unopt_ans)
assert np.allclose(np_ans, opt_ans)
assert np.allclose(np_ans, blas_ans)
%load_ext version_information
%version_information numpy, pandas, numba, numbapro
Software     Version
Python       2.7.9 64bit [GCC 4.2.1 (Apple Inc. build 5577)]
IPython      2.2.0
OS           Darwin 13.4.0 x86_64 i386 64bit
numpy        1.9.2
pandas       0.14.1
numba        0.17.0
numbapro     0.17.1
Sun Mar 29 19:32:55 2015 EDT
