CUDA call never assigns more than 8 threads per block, regardless of what is specified
I am creating a parallel version of the Sieve of Eratosthenes in C++. The problem is that my kernel call (reduce0) only ever seems to assign 8 threads per block instead of the 256 I specify. Since even the first CUDA version allows 512 threads per block, there must be some error in my code. Any help would be appreciated.
#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil.h>
//#include <sieve_kernel.cu>
using namespace std;
////////////////////////////////////////////////////
int psum(int arg[], double n);
int call_kernel(int primes[], int n);
int findsmallest(int arg[], int f, double n);
int sieve(int n);
__global__ void reduce0(int *g_idata, int *g_odata);
////////////////////////////////////////////////////
int main(){
    int n = pow((double) 2, 8);
    int total = sieve(n);
    cout << "# primes" << endl << total << endl;
    return 0;
}
///////////////////////////////////////////////////
__global__ void reduce0(int *g_idata, int *g_odata) {
    extern __shared__ int sdata[];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem
    for (int s = 1; s < blockDim.x; s *= 2) { // step = s x 2
        if (tid % (s*2) == 0) { // only threadIDs divisible by the step participate
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////////////////////////////////
int call_kernel(int *primes, int n){
    // Allocate and copy device arrays
    int *g_idevice;
    int *g_odevice;
    int size = n * sizeof(int);
    cudaMalloc(&g_idevice, size);
    cudaMemcpy(g_idevice, primes, size, cudaMemcpyHostToDevice);
    cudaMalloc(&g_odevice, size);
    // Specify grid/block dimensions and invoke the kernel
    dim3 dimGrid(1,1);
    dim3 dimBlock(256,1);
    reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
    // Copy device data back to primes
    cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
    //for (int i = 0; i < n; i++) {
    //    cout << i << " " << primes[i] << endl;
    //}
    int total = primes[0];
    cudaFree(g_idevice);
    cudaFree(g_odevice);
    return total;
}
/////////////////////////////////////////////////////////////////////
int findsmallest(int arg[], int f, double n){
    int i = f;
    while(arg[i] != 1 && i < n) {
        i++;
    }
    return i;
}
//////////////////////////////////////////////////////////////////////
int psum(int arg[], double n){
    int total = 0;
    int i = 2;
    while(i < n){
        if(arg[i] == 1){
            total = total + 1;
        }
        i++;
    }
    return total;
}
/////////////////////////////////////////////////////////////////////////
int sieve(int n){
    int* primes = NULL;
    int mult = 0;
    int k = 2;
    int i; int total;
    //primes = new int[n];
    primes = new int[256];
    for(i = 0; i < n; i++){
        primes[i] = 1;
    }
    primes[0] = primes[1] = 0;
    while (k * k < n){
        mult = k * k;
        while (mult < n) {
            primes[mult] = 0;
            mult = mult + k;
        }
        k = findsmallest(primes, k+1, n);
    }
    total = call_kernel(primes, n);
    //delete [] primes;
    //primes = NULL;
    return total;
}
1 Answer
Your kernel is using dynamically allocated shared memory, but the kernel launch does not include any allocation for it, so the kernel aborts because of illegal memory operations on that shared memory buffer. You should find it works if you modify this part of call_kernel as follows:
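A minimal sketch of that change, assuming one int of dynamic shared memory per thread is enough for reduce0, passed as the third launch-configuration argument:

// Reserve dimBlock.x * sizeof(int) bytes of dynamic shared memory per block;
// this is the buffer that "extern __shared__ int sdata[]" in reduce0 indexes into.
reduce0<<<dimGrid, dimBlock, dimBlock.x * sizeof(int)>>>(g_idevice, g_odevice);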
If you had included some basic error checking around the function call, perhaps like this:
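A sketch of that kind of checking using the standard CUDA runtime error-query calls (cudaPeekAtLastError, cudaGetLastError, cudaGetErrorString); the message strings are only illustrative:

// Launch with the dynamic shared memory allocation and check the launch status
reduce0<<<dimGrid, dimBlock, dimBlock.x * sizeof(int)>>>(g_idevice, g_odevice);
if (cudaPeekAtLastError() != cudaSuccess) {
    cout << "kernel launch error: "
         << cudaGetErrorString(cudaGetLastError()) << endl;
}

// Copy device data back to primes and check the copy status
cudaError_t err = cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
    cout << "CUDA error copying to host: " << cudaGetErrorString(err) << endl;
}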
it would have been immediately obvious that the kernel launch or execution was failing with an error.