- Introduction to Python
- Getting started with Python and the IPython notebook
- Functions are first class objects
- Data science is OSEMN
- Working with text
- Preprocessing text data
- Working with structured data
- Using SQLite3
- Using HDF5
- Using numpy
- Using Pandas
- Computational problems in statistics
- Computer numbers and mathematics
- Algorithmic complexity
- Linear Algebra and Linear Systems
- Linear Algebra and Matrix Decompositions
- Change of Basis
- Optimization and Non-linear Methods
- Practical Optimization Routines
- Finding roots
- Optimization Primer
- Using scipy.optimize
- Gradient descent
- Newton’s method and variants
- Constrained optimization
- Curve fitting
- Finding parameters for ODE models
- Optimization of graph node placement
- Optimization of standard statistical models
- Fitting ODEs with the Levenberg–Marquardt algorithm
- 1D example
- 2D example
- Algorithms for Optimization and Root Finding for Multivariate Problems
- Expectation Maximization (EM) Algorithm
- Monte Carlo Methods
- Resampling methods
- Resampling
- Simulations
- Setting the random seed
- Sampling with and without replacement
- Calculation of Cook’s distance
- Permutation resampling
- Design of simulation experiments
- Example: Simulations to estimate power
- Check with R
- Estimating the CDF
- Estimating the PDF
- Kernel density estimation
- Multivariate kernel density estimation
- Markov Chain Monte Carlo (MCMC)
- Using PyMC2
- Using PyMC3
- Using PyStan
- C Crash Course
- Code Optimization
- Using C code in Python
- Using functions from various compiled languages in Python
- Julia and Python
- Converting Python Code to C for speed
- Optimization bake-off
- Writing Parallel Code
- Massively parallel programming with GPUs
- Writing CUDA in C
- Distributed computing for Big Data
- Hadoop MapReduce on AWS EMR with mrjob
- Spark on a local machine using 4 nodes
- Modules and Packaging
- Tour of the Jupyter (IPython3) notebook
- Polyglot programming
- What you should know and learn more about
- Wrapping R libraries with Rpy
Recreational
We will plot the famous Mandelbrot fractal and compare the code and run times of pure Python, Numba, and GPU versions.
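As a reminder of how escape-time coloring works: a point c = x + iy belongs to the Mandelbrot set if the iteration z ← z² + c, started from z = 0, stays bounded. Once |z| reaches 2 (equivalently, |z|² reaches 4), divergence is guaranteed, so the color functions below simply count how many iterations it takes to escape, capped at max_iters.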
Pure Python
# color function for point at (x, y)
def mandel(x, y, max_iters):
    c = complex(x, y)
    z = 0.0j
    for i in range(max_iters):
        z = z*z + c
        if z.real*z.real + z.imag*z.imag >= 4:
            return i
    return max_iters
def create_fractal(xmin, xmax, ymin, ymax, image, iters):
    height, width = image.shape

    pixel_size_x = (xmax - xmin)/width
    pixel_size_y = (ymax - ymin)/height

    for x in range(width):
        real = xmin + x*pixel_size_x
        for y in range(height):
            imag = ymin + y*pixel_size_y
            color = mandel(real, imag, iters)
            image[y, x] = color
import numpy as np
import matplotlib.pyplot as plt
from timeit import default_timer as timer

gimage = np.zeros((1024, 1536), dtype=np.uint8)
xmin, xmax, ymin, ymax = np.array([-2.0, 1.0, -1.0, 1.0]).astype('float32')
iters = 50

start = timer()
create_fractal(xmin, xmax, ymin, ymax, gimage, iters)
dt = timer() - start

print "Mandelbrot created on CPU in %f s" % dt
plt.imshow(gimage);
Mandelbrot created on CPU in 34.773193 s
Numba
# Reuse regular function on GPU by using jit decorator
# This is using the jit decorator as a function (to avoid copying and pasting code)
import numba
from numba import uint32, float32

mandel_numba = numba.jit(restype=uint32, argtypes=[float32, float32, uint32])(mandel)
@numba.jit
def create_fractal_numba(xmin, xmax, ymin, ymax, image, iters):
    height, width = image.shape

    pixel_size_x = (xmax - xmin)/width
    pixel_size_y = (ymax - ymin)/height

    for x in range(width):
        real = xmin + x*pixel_size_x
        for y in range(height):
            imag = ymin + y*pixel_size_y
            color = mandel_numba(real, imag, iters)
            image[y, x] = color
gimage = np.zeros((1024, 1536), dtype=np.uint8)
xmin, xmax, ymin, ymax = np.array([-2.0, 1.0, -1.0, 1.0]).astype('float32')
iters = 50

start = timer()
create_fractal_numba(xmin, xmax, ymin, ymax, gimage, iters)
dt = timer() - start

print "Mandelbrot created on CPU in %f s" % dt
plt.imshow(gimage);
Mandelbrot created on CPU in 0.339179 s
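Note that the restype/argtypes keywords above are the old numba/NumbaPro API. As a minimal sketch, assuming a recent numba release (where those keywords are gone and types are inferred lazily on the first call), the same speedup is obtained with the njit decorator:

# A sketch assuming a modern numba install; no explicit signatures needed
import numba

@numba.njit  # nopython-mode JIT; argument types are inferred at first call
def mandel_njit(x, y, max_iters):
    c = complex(x, y)
    z = 0.0j
    for i in range(max_iters):
        z = z*z + c
        if z.real*z.real + z.imag*z.imag >= 4:
            return i
    return max_iters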
CUDA
# Reuse regular function on GPU by using jit decorator
from numbapro import cuda
from numba import uint8, uint32, float32

mandel_gpu = cuda.jit(restype=uint32, argtypes=[float32, float32, uint32], device=True)(mandel)
@cuda.jit(argtypes=[float32, float32, float32, float32, uint8[:,:], uint32])
def create_fractal_kernel(xmin, xmax, ymin, ymax, image, iters):
    height, width = image.shape

    pixel_size_x = (xmax - xmin)/width
    pixel_size_y = (ymax - ymin)/height

    startX, startY = cuda.grid(2)
    gridX = cuda.gridDim.x * cuda.blockDim.x  # stride in x
    gridY = cuda.gridDim.y * cuda.blockDim.y  # stride in y

    for x in range(startX, width, gridX):
        real = xmin + x*pixel_size_x
        for y in range(startY, height, gridY):
            imag = ymin + y*pixel_size_y
            color = mandel_gpu(real, imag, iters)
            image[y, x] = color
gimage = np.zeros((1024, 1536), dtype=np.uint8)
blockdim = (32, 8)
griddim = (32, 16)
xmin, xmax, ymin, ymax = np.array([-2.0, 1.0, -1.0, 1.0]).astype('float32')
iters = 50

start = timer()
d_image = cuda.to_device(gimage)
create_fractal_kernel[griddim, blockdim](xmin, xmax, ymin, ymax, d_image, iters)
d_image.to_host()
dt = timer() - start

print "Mandelbrot created on GPU in %f s" % dt
plt.imshow(gimage);
Mandelbrot created on GPU in 0.010257 s
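From the timings above, Numba's JIT gives roughly a 100-fold speedup over pure Python (34.77 s vs 0.34 s), and the GPU kernel another ~33-fold on top of that (0.010 s, or about 3400-fold over pure Python). The kernel uses a grid-stride loop: with griddim = (32, 16) and blockdim = (32, 8), cuda.grid(2) assigns each of the 1024 × 128 threads a distinct starting pixel, and each thread then strides across the 1536 × 1024 image in steps of the total grid size (1024 in x, 128 in y) so every pixel is covered exactly once.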
Using CUDA libraries
See documentation at http://docs.continuum.io/numbapro/cudalib.html
Matrix multiplication with cublas
import numbapro.cudalib.cublas as cublas

blas = cublas.Blas()

n = 100
A = np.random.random((n, n)).astype(np.float32)
B = np.random.random((n, n)).astype(np.float32)
C = np.zeros_like(A, order='F')

blas.gemm('T', 'T', n, n, n, 1.0, A, B, 1.0, C)
assert(np.allclose(np.dot(A, B), C))
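The two 'T' arguments compensate for memory layout: BLAS routines assume column-major (Fortran) order, so a row-major NumPy array looks transposed to cuBLAS, and passing 'T' undoes that; likewise C is allocated with order='F' so the result reads back correctly. As a CPU cross-check of the same product, assuming scipy is available (it is not part of numbapro):

# A minimal CPU cross-check via scipy's BLAS wrapper; sgemm returns alpha*A*B
from scipy.linalg.blas import sgemm

C_cpu = sgemm(1.0, A, B)
assert np.allclose(np.dot(A, B), C_cpu)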
Random numbers with curand
from numbapro import vectorize
from numbapro.cudalib import curand

prng = curand.PRNG()
prng.seed = 123

@vectorize('float32(float32)', target='gpu')
def shift(x):
    return x*2 - 1

n = int(1e7)  # shape arguments must be integers
x = np.empty(n*2).astype(np.float32)
prng.uniform(x)
r = shift(x).reshape((n, 2))
pi_hat = 4*(r[:,0]**2 + r[:,1]**2 < 1).sum()/float(n)
pi_hat
3.1409
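The estimator works because shift maps the uniform [0, 1) draws onto the square [-1, 1]², and the unit circle occupies a fraction π/4 of that square's area, so four times the inside-the-circle fraction estimates π. A quick sanity check of the same estimator on the CPU, assuming nothing beyond numpy:

# CPU-only version of the same Monte Carlo estimate (no GPU required)
n = int(1e7)
r = np.random.uniform(-1, 1, (n, 2))
pi_cpu = 4*(r[:,0]**2 + r[:,1]**2 < 1).sum()/float(n)
print pi_cpu  # should also be close to 3.1416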
FFT and IFFT
import numbapro.cudalib.cufft as cufft
num = 4
v = np.random.normal(0, 1, (num, 2))
z = v[:,0] + 1j*v[:,1]
print "{:<20}".format('Original'), z

x_gpu = np.zeros(num, dtype='complex')
cufft.fft(z, x_gpu)
print "{:<20}".format('CUDA FFT'), x_gpu

x_cpu = np.fft.fft(z)
print "{:<20}".format('CPU FFT'), x_cpu

# NVIDIA IFFT returns unnormalized results
cufft.ifft(x_gpu, z)
print "{:<20}".format('CUDA IFFT'), z/num

x_cpu = np.fft.ifft(x_cpu)
print "{:<20}".format('CPU IFFT'), x_cpu
Original             [ 0.8236-0.564j   0.0743-1.0426j  0.3215+1.0885j -0.7250-1.7846j]
CUDA FFT             [ 0.4944-2.3028j  1.2440-2.4518j  1.7958+3.3518j -0.2400-0.853j ]
CPU FFT              [ 0.4944-2.3028j  1.2440-2.4518j  1.7958+3.3518j -0.2400-0.853j ]
CUDA IFFT            [ 0.8236-0.564j   0.0743-1.0426j  0.3215+1.0885j -0.7250-1.7846j]
CPU IFFT             [ 0.8236-0.564j   0.0743-1.0426j  0.3215+1.0885j -0.7250-1.7846j]
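As the comment in the code notes, the CUDA IFFT is unnormalized, which is why its result is divided by num before printing; numpy's ifft applies the 1/N factor itself. A minimal sanity check of the CPU round trip, assuming only numpy:

# fft followed by ifft recovers the input, since numpy normalizes the inverse
assert np.allclose(np.fft.ifft(np.fft.fft(z)), z)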