Error: BFS on CUDA synchronization

Posted on 2024-12-10 16:21:38

My following code has a problem: when it runs, some of the graph weights are being overwritten, which should not be able to happen given the Xa array (which records which nodes have already been visited) and the __syncthreads() call... Can someone help?

struct Node 
{
    int begin;  // start of this node's edge sub-array in Ea
    int num;    // number of edges for this node
};

__global__ void BFS (Node *Va, int *Ea, bool *Fa, bool *Xa, int *Ca, bool *parada)
{
    int tid = threadIdx.x;

    if (Fa[tid] == true && Xa[tid] == false)
    {
        Fa[tid] = false; 
        __syncthreads();

        // Va[tid].begin is where this node's edge sub-array starts in Ea;
        // Va[tid].num is its number of elements
        for (int i = Va[tid].begin;  i < (Va[tid].begin + Va[tid].num); i++) 
        {           
            int nid = Ea[i];

            if (Xa[nid] == false)
            {
                Ca[nid] = Ca[tid] + 1;
                Fa[nid] = true;
                *parada = true;
            }   
        }    
        Xa[tid] = true;             
    }
}

// The BFS frontier corresponds to all the nodes being processed 
// at the current level.
int main()
{

    // graph description (CSR-style: node i's neighbours are edges[begin .. begin+num-1])
    struct Node node[4]; 
    node[0].begin=0; 
    node[0].num=2; 
    node[1].begin=1; 
    node[1].num=0; 
    node[2].begin=2; 
    node[2].num=2; 
    node[3].begin=1; 
    node[3].num=0; 
    int edges[]={1,2,3,1}; 

    bool frontier[4]={false}; 
    bool visited[4]={false}; 
    int custo[4]={0}; 

    int source=0; 
    frontier[source]=true; 

    Node* Va; 
    cudaMalloc((void**)&Va,sizeof(Node)*4); 
    cudaMemcpy(Va,node,sizeof(Node)*4,cudaMemcpyHostToDevice); 

    int* Ea; 
    cudaMalloc((void**)&Ea,sizeof(int)*4); 
    cudaMemcpy(Ea,edges,sizeof(int)*4,cudaMemcpyHostToDevice); 

    bool* Fa; 
    cudaMalloc((void**)&Fa,sizeof(bool)*4); 
    cudaMemcpy(Fa,frontier,sizeof(bool)*4,cudaMemcpyHostToDevice); 

    bool* Xa; 
    cudaMalloc((void**)&Xa,sizeof(bool)*4); 
    cudaMemcpy(Xa,visited,sizeof(bool)*4,cudaMemcpyHostToDevice); 

    int* Ca; 
    cudaMalloc((void**)&Ca,sizeof(int)*4); 
    cudaMemcpy(Ca,custo,sizeof(int)*4,cudaMemcpyHostToDevice); 

    dim3 threads(4,1,1); 

    bool para; 
    bool* parada; 
    cudaMalloc((void**)&parada,sizeof(bool)); 
    printf("\n");
    int n=1;
    do{ 
        para=false; 
        cudaMemcpy(parada,&para,sizeof(bool),cudaMemcpyHostToDevice);       
        BFS <<<1,threads>>>(Va,Ea,Fa,Xa,Ca,parada);     
        CUT_CHECK_ERROR("kernel1 execution failed"); 
        cudaMemcpy(&para,parada,sizeof(bool),cudaMemcpyDeviceToHost); 



        printf("Run number: %d >> ",n); 
        cudaMemcpy(custo,Ca,sizeof(int)*4,cudaMemcpyDeviceToHost);  
        for(int i=0;i<4;i++) 
            printf("%d  ",custo[i]); 
        printf("\n");
        n++;

    }while(para); 


    printf("\nFinal:\n");
    cudaMemcpy(custo,Ca,sizeof(int)*4,cudaMemcpyDeviceToHost); 

    for(int i=0;i<4;i++) 
        printf("%d  ",custo[i]); 
    printf("\n");

}

Comments (1)

夏末 2024-12-17 16:21:38

There are a number of pretty major flaws in that device code. Firstly, you have memory races on both Xa and Ca. Secondly, you have a conditionally executed __syncthreads() call, which is illegal and can cause the kernel to hang if it is executed by a warp of threads in which branch divergence occurs around the call.
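
For illustration only, here is a minimal sketch (not a complete or correct BFS, just the barrier pattern) of how the kernel could be restructured so that __syncthreads() is executed unconditionally by every thread in the block, with only the per-thread work left inside the divergent branch:

    // Sketch only: hoisting the barrier out of divergent control flow.
    // Every thread evaluates its predicate, the barrier is reached by the
    // whole block, and only the per-thread work stays inside the branch.
    __global__ void BFS(Node *Va, int *Ea, bool *Fa, bool *Xa, int *Ca, bool *parada)
    {
        int tid = threadIdx.x;

        bool active = (Fa[tid] && !Xa[tid]);   // predicate computed by every thread
        if (active)
            Fa[tid] = false;

        __syncthreads();                       // unconditional: legal for the whole block

        if (active)
        {
            for (int i = Va[tid].begin; i < Va[tid].begin + Va[tid].num; i++)
            {
                int nid = Ea[i];
                if (!Xa[nid])
                {
                    Ca[nid] = Ca[tid] + 1;     // NOTE: still racy; see the atomic sketch below
                    Fa[nid] = true;
                    *parada = true;
                }
            }
            Xa[tid] = true;
        }
    }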

The structure of the algorithm you are using probably isn't going to be correct on CUDA, even if you were to use atomic memory access functions to eliminate the worst of the read-after-write races in the code as posted. Using atomic memory access will effectively serialise the code and cost a great deal of performance.
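
To illustrate what such an atomic fix would look like, the racy assignment to Ca could be replaced by an atomic minimum inside the edge loop. This is only a sketch, and it relies on an extra assumption the posted code does not satisfy: the cost array would have to be initialised to a large sentinel (e.g. INT_MAX) for every node except the source, rather than to 0, so that the smallest competing write wins:

    // Sketch: atomic relaxation of the cost array inside the edge loop.
    // Assumes Ca[] is initialised to INT_MAX for all nodes except the source.
    for (int i = Va[tid].begin; i < Va[tid].begin + Va[tid].num; i++)
    {
        int nid = Ea[i];
        if (!Xa[nid])
        {
            atomicMin(&Ca[nid], Ca[tid] + 1);  // colliding writers are serialised
            Fa[nid] = true;
            *parada = true;
        }
    }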

Breadth first search on CUDA isn't an unsolved problem. There are a number of good papers on implementations, if you care to search for them. I would recommend High Performance and Scalable GPU Graph Traversal, if you have not already seen it. The code for those authors' implementation is also available for download from here.
