CUDA 中的位数组

发布于 2024-10-08 02:15:26 字数 1745 浏览 1 评论 0原文

我正在 CUDA 中实现埃拉托斯特尼筛法，并且有一个非常奇怪的输出。我使用 unsigned char* 作为数据结构，并使用以下宏来操作位。

#define ISBITSET(x,i) ((x[i>>3] & (1<<(i&7)))!=0)
#define SETBIT(x,i) x[i>>3]|=(1<<(i&7));
#define CLEARBIT(x,i) x[i>>3]&=(1<<(i&7))^0xFF;

我设置该位来表示它是素数，否则它 = 0。这是我调用我的内核的地方

size_t p=3;
size_t primeTill = 30;

while(p*p<=primeTill)
{
    if(ISBITSET(h_a, p) == 1){
        int dimA = 30;
        int numBlocks = 1;
        int numThreadsPerBlock = dimA;
        dim3 dimGrid(numBlocks);
        dim3 dimBlock(numThreadsPerBlock);
        cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice );        
        cudaThreadSynchronize();    
        reverseArrayBlock<<< dimGrid, dimBlock >>>( d_a, primeTill, p );
        cudaThreadSynchronize();    
        cudaMemcpy( h_a, d_a, memSize, cudaMemcpyDeviceToHost );
        cudaThreadSynchronize();    
        printf("This is after removing multiples of %d\n", p);
        //Loop
        for(size_t i = 0; i < primeTill +1; i++)
        {
            printf("Bit %d is %d\n", i, ISBITSET(h_a, i));
        }
    }           
    p++;
}

这是我的内核

__global__ void reverseArrayBlock(unsigned char *d_out, int size, size_t p)
{
int id = blockIdx.x*blockDim.x + threadIdx.x;
int r = id*p;
if(id >= p && r <= size )
{
    while(ISBITSET(d_out, r ) == 1 ){
        CLEARBIT(d_out, r);
    }

    // if(r == 9)
    // {
    //  /* code */
    //  CLEARBIT(d_out, 9);
    // }

}

} 输出应该是： 2, 3, 5, 7, 11, 13, 17, 19, 23, 29 而我的输出是： 2, 3, 5, 9, 7, 11, 13, 17, 19, 23, 29

如果你看一下内核代码，如果我取消注释这些行，我将得到正确的答案，这意味着没有任何问题用我的循环或我的检查！

原文

Am implementing Sieve of Eratosthenes in CUDA and am having a very weird output. Am using unsigned char* as the data structure and using the following macros to manipulate the bits.

#define ISBITSET(x,i) ((x[i>>3] & (1<<(i&7)))!=0)
#define SETBIT(x,i) x[i>>3]|=(1<<(i&7));
#define CLEARBIT(x,i) x[i>>3]&=(1<<(i&7))^0xFF;

I set the bit to denote it's a prime number, otherwise it's = 0.
Here is where i call my kernel

size_t p=3;
size_t primeTill = 30;

while(p*p<=primeTill)
{
    if(ISBITSET(h_a, p) == 1){
        int dimA = 30;
        int numBlocks = 1;
        int numThreadsPerBlock = dimA;
        dim3 dimGrid(numBlocks);
        dim3 dimBlock(numThreadsPerBlock);
        cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice );        
        cudaThreadSynchronize();    
        reverseArrayBlock<<< dimGrid, dimBlock >>>( d_a, primeTill, p );
        cudaThreadSynchronize();    
        cudaMemcpy( h_a, d_a, memSize, cudaMemcpyDeviceToHost );
        cudaThreadSynchronize();    
        printf("This is after removing multiples of %d\n", p);
        //Loop
        for(size_t i = 0; i < primeTill +1; i++)
        {
            printf("Bit %d is %d\n", i, ISBITSET(h_a, i));
        }
    }           
    p++;
}

Here is my kernel

__global__ void reverseArrayBlock(unsigned char *d_out, int size, size_t p)
{
int id = blockIdx.x*blockDim.x + threadIdx.x;
int r = id*p;
if(id >= p && r <= size )
{
    while(ISBITSET(d_out, r ) == 1 ){
        CLEARBIT(d_out, r);
    }

    // if(r == 9)
    // {
    //  /* code */
    //  CLEARBIT(d_out, 9);
    // }

}

}
The output should be:
2, 3, 5, 7, 11, 13, 17, 19, 23, 29
while my output is:
2, 3, 5, 9, 7, 11, 13, 17, 19, 23, 29

If you take a look at the kernel code, if i uncomment those lines i will get the correct answer, which means that there is nothing wrong with my loops or my checking!

分享到QQ

分享到微博