CUDA BFS 巨图(seg.fault)

发布于 2024-12-14 06:32:45 字数 6887 浏览 5 评论 0原文

我正在 CUDA 上对 BFS 算法进行测试(我知道它存在一些同步问题,但无论如何测试它是我工作的一部分),但我在使用(或创建?)1M+ 大小的图表时遇到问题。

这是我用来创建它们的代码:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>


#define GRAPHSIZE 1000000

struct Node 
{
    int begin;     // comeco da sub-string de vizinhos
    int num;    // tamanho da sub-string de vizinhos
};



int getSize()
{
int size,arcs;

    printf("Size of the graph: \nNodes:\n>>");
    scanf ("%d", &size);

return size;
}



void createEdges(int graphSize, int* Edges)
{

int j,value, aux, rndIdx;

int edgesSize = 2*GRAPHSIZE;

srand(time(NULL));



printf ("\nGS : %d\n", graphSize);

j = 1;

    for (int i=0; i < edgesSize; i++) //first it creates an ordered array of edges
    {

        if (j < GRAPHSIZE)
        {
        Edges [i] = j;
        j++;
        }
            else
            {
            j=1;        
            Edges [i] = j; 
            j++;
            }

    }



    for (int i=0; i < edgesSize; i++) //now, it randomly swaps the edges array
    {

    rndIdx = rand()%graphSize;

    aux = Edges[rndIdx];
    Edges[rndIdx] = Edges [i];
    Edges [i] = aux;

    }

} 


int main ()
{

int size,graphAtts[2];

int edgesSize = 2*GRAPHSIZE;

int Edges[edgesSize];

struct Node node[GRAPHSIZE];

FILE *file;



printf("____________________________\nRandom graph generator in compact format, optmized for CUDA algorithms by Ianuarivs Severvs.\nFor details about this format read the README. \n");

//size = getSize(graphAtts);

//printf ("%d,%d",size,arcs);

createEdges(GRAPHSIZE,Edges); // or size?

/*
    for (int i = 0; i < edgesSize ; i ++)
    {
    printf ("-- %d --", Edges[i]);
    }   

*/

printf("\nEdges:\n");
for (int i=0; i < edgesSize; i++) 
printf("%d,",Edges[i]);


    for (int i=0,j=0 ; i < GRAPHSIZE; i++,j+=2) // now, completes the graph
    {
    node[i].begin=j;
    node[i].num=2;
    printf ("\n node %d : begin = %d, num = 2",i,j); 
    }

printf("\n");

//writes file:
file = fopen ("graph1M.g","wb");
fwrite (&Edges, edgesSize * sizeof(int),1,file);
fwrite (&node, GRAPHSIZE * sizeof(struct Node),1,file);
fclose(file);


    for (int i = 0; i < edgesSize ; i ++)
    {
    printf ("-- %d --", Edges[i]);
    }   


    for (int i = 0; i < GRAPHSIZE ; i ++)
    {
    printf ("* %d *", i);
    }


}

这是我的 BFS 代码(在 CUDA 上):

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <cutil.h> 

#define GRAPHSIZE 1000000

struct Node 
{
    int begin;     // begining of the substring
    int num;    // size of the sub-string 
};

__global__ void BFS (Node *Va, int *Ea, bool *Fa, bool *Xa, int *Ca, bool *parada) // memory races on both Xa and Ca
{

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid > GRAPHSIZE)
           *parada=true;


        if (Fa[tid] == true && Xa[tid] == false)
        {
        Fa[tid] = false; 
        Xa[tid] = true;
        //__syncthreads(); // this solves the memrace problem as long as the threads are all on the same block
                   for (int i = Va[tid].begin;  i < (Va[tid].begin + Va[tid].num); i++) // Va begin is where it's edges' subarray begins, Va is it's                                                                        number of elements
            {             
                int nid = Ea[i];

                if (Xa[nid] == false)
                {
                Ca[nid] = Ca[tid] + 1;               
                Fa[nid] = true;             
                *parada = true;
                }

            }                   

        }

}

// The BFS frontier corresponds to all the nodes being processed at the current level.


int main()
{

    // for the time couting:
    cudaEvent_t start, stop;
    float time;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);


    FILE * file;

    printf("\nLoading graph file...\n");


    struct Node node[GRAPHSIZE];
    int edgesSize = 2*GRAPHSIZE;
    int edges[edgesSize];


    file = fopen ("graph1M.g","rb");
    printf("abriu");    
    fread (&edges, edgesSize * sizeof(int),1,file);
    fread (&node, GRAPHSIZE * sizeof(struct Node),1,file);
    fclose(file);

 //For file read test propouses only:

/*
    for (int i = 0; i < edgesSize ; i ++)
    {
    printf ("-- %d --", edges[i]);
    }   


    for (int i = 0; i < GRAPHSIZE ; i ++)
    {
    printf ("* %d *", i);
    }
*/  



    bool frontier[GRAPHSIZE]={false}; 
    bool visited[GRAPHSIZE]={false}; 
    int custo[GRAPHSIZE]={0}; 

    int source=0; 
    frontier[source]=true; 

    Node* Va; 
    cudaMalloc((void**)&Va,sizeof(Node)*GRAPHSIZE); 
    cudaMemcpy(Va,node,sizeof(Node)*GRAPHSIZE,cudaMemcpyHostToDevice); 

    int* Ea; 
    cudaMalloc((void**)&Ea,sizeof(Node)*GRAPHSIZE); 
    cudaMemcpy(Ea,edges,sizeof(Node)*GRAPHSIZE,cudaMemcpyHostToDevice); 

    bool* Fa; 
    cudaMalloc((void**)&Fa,sizeof(bool)*GRAPHSIZE); 
    cudaMemcpy(Fa,frontier,sizeof(bool)*GRAPHSIZE,cudaMemcpyHostToDevice); 

    bool* Xa; 
    cudaMalloc((void**)&Xa,sizeof(bool)*GRAPHSIZE); 
    cudaMemcpy(Xa,visited,sizeof(bool)*GRAPHSIZE,cudaMemcpyHostToDevice); 

    int* Ca; 
    cudaMalloc((void**)&Ca,sizeof(int)*GRAPHSIZE); 
    cudaMemcpy(Ca,custo,sizeof(int)*GRAPHSIZE,cudaMemcpyHostToDevice); 


    dim3 grid(100,100,1); //blocks per grid
    dim3 threads(100,1,1);  // threads per block




    bool para; 
    bool* parada; 
    cudaMalloc((void**)&parada,sizeof(bool)); 
    printf("_____________________________________________\n");
    int count=0;

    cudaEventRecord(start, 0);
    do{ 
        count ++;
        para=false; 
        cudaMemcpy(parada,&para,sizeof(bool),cudaMemcpyHostToDevice);       
        BFS <<<grid,threads,0>>>(Va,Ea,Fa,Xa,Ca,parada);    
        CUT_CHECK_ERROR("kernel1 execution failed"); 
        cudaMemcpy(&para,parada,sizeof(bool),cudaMemcpyDeviceToHost); 


    }while(para); 

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);


    //printf("\nFinal:\n");
    cudaMemcpy(custo,Ca,sizeof(int)*GRAPHSIZE,cudaMemcpyDeviceToHost); 
 /*
    printf("\n_____________________________________________\n");
    for(int i=0;i<GRAPHSIZE;i++) 
        printf("%d  ",custo[i]); 
    printf("\n");

    printf("_____________________________________________\n");
*/

    cudaEventElapsedTime(&time, start, stop);
    printf ("\nTime for the kernel: %lf s \n", time/1000);
    printf ("Number of kernel calls : %d \n", count);


    file = fopen ("graph125MPar","w");


    for(int i=0;i<GRAPHSIZE;i++) 
        fprintf(file,"%d    ",custo[i]); 
    fprintf(file,"\n");
    fclose(file);



}

我在尝试为 1M+ 图形运行它时遇到分段错误(请注意,我使用了更改后的系统堆栈大小)在 Linux 上命令“ulimit -s 16384”)

有人可以帮忙吗?

I'm making a test on a BFS algorithm on CUDA ( wich I know that has some syncronization problems, but it's part of my work to test it anyway ) but i'm having problems on using (or creating?) 1M+ size graphs.

Here's the code I use to create them:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>


#define GRAPHSIZE 1000000

struct Node 
{
    int begin;     // comeco da sub-string de vizinhos
    int num;    // tamanho da sub-string de vizinhos
};



int getSize()
{
int size,arcs;

    printf("Size of the graph: \nNodes:\n>>");
    scanf ("%d", &size);

return size;
}



void createEdges(int graphSize, int* Edges)
{

int j,value, aux, rndIdx;

int edgesSize = 2*GRAPHSIZE;

srand(time(NULL));



printf ("\nGS : %d\n", graphSize);

j = 1;

    for (int i=0; i < edgesSize; i++) //first it creates an ordered array of edges
    {

        if (j < GRAPHSIZE)
        {
        Edges [i] = j;
        j++;
        }
            else
            {
            j=1;        
            Edges [i] = j; 
            j++;
            }

    }



    for (int i=0; i < edgesSize; i++) //now, it randomly swaps the edges array
    {

    rndIdx = rand()%graphSize;

    aux = Edges[rndIdx];
    Edges[rndIdx] = Edges [i];
    Edges [i] = aux;

    }

} 


int main ()
{

int size,graphAtts[2];

int edgesSize = 2*GRAPHSIZE;

int Edges[edgesSize];

struct Node node[GRAPHSIZE];

FILE *file;



printf("____________________________\nRandom graph generator in compact format, optmized for CUDA algorithms by Ianuarivs Severvs.\nFor details about this format read the README. \n");

//size = getSize(graphAtts);

//printf ("%d,%d",size,arcs);

createEdges(GRAPHSIZE,Edges); // or size?

/*
    for (int i = 0; i < edgesSize ; i ++)
    {
    printf ("-- %d --", Edges[i]);
    }   

*/

printf("\nEdges:\n");
for (int i=0; i < edgesSize; i++) 
printf("%d,",Edges[i]);


    for (int i=0,j=0 ; i < GRAPHSIZE; i++,j+=2) // now, completes the graph
    {
    node[i].begin=j;
    node[i].num=2;
    printf ("\n node %d : begin = %d, num = 2",i,j); 
    }

printf("\n");

//writes file:
file = fopen ("graph1M.g","wb");
fwrite (&Edges, edgesSize * sizeof(int),1,file);
fwrite (&node, GRAPHSIZE * sizeof(struct Node),1,file);
fclose(file);


    for (int i = 0; i < edgesSize ; i ++)
    {
    printf ("-- %d --", Edges[i]);
    }   


    for (int i = 0; i < GRAPHSIZE ; i ++)
    {
    printf ("* %d *", i);
    }


}

And here's my BFS code (on CUDA):

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <cutil.h> 

#define GRAPHSIZE 1000000

struct Node 
{
    int begin;     // begining of the substring
    int num;    // size of the sub-string 
};

__global__ void BFS (Node *Va, int *Ea, bool *Fa, bool *Xa, int *Ca, bool *parada) // memory races on both Xa and Ca
{

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid > GRAPHSIZE)
           *parada=true;


        if (Fa[tid] == true && Xa[tid] == false)
        {
        Fa[tid] = false; 
        Xa[tid] = true;
        //__syncthreads(); // this solves the memrace problem as long as the threads are all on the same block
                   for (int i = Va[tid].begin;  i < (Va[tid].begin + Va[tid].num); i++) // Va begin is where it's edges' subarray begins, Va is it's                                                                        number of elements
            {             
                int nid = Ea[i];

                if (Xa[nid] == false)
                {
                Ca[nid] = Ca[tid] + 1;               
                Fa[nid] = true;             
                *parada = true;
                }

            }                   

        }

}

// The BFS frontier corresponds to all the nodes being processed at the current level.


int main()
{

    // for the time couting:
    cudaEvent_t start, stop;
    float time;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);


    FILE * file;

    printf("\nLoading graph file...\n");


    struct Node node[GRAPHSIZE];
    int edgesSize = 2*GRAPHSIZE;
    int edges[edgesSize];


    file = fopen ("graph1M.g","rb");
    printf("abriu");    
    fread (&edges, edgesSize * sizeof(int),1,file);
    fread (&node, GRAPHSIZE * sizeof(struct Node),1,file);
    fclose(file);

 //For file read test propouses only:

/*
    for (int i = 0; i < edgesSize ; i ++)
    {
    printf ("-- %d --", edges[i]);
    }   


    for (int i = 0; i < GRAPHSIZE ; i ++)
    {
    printf ("* %d *", i);
    }
*/  



    bool frontier[GRAPHSIZE]={false}; 
    bool visited[GRAPHSIZE]={false}; 
    int custo[GRAPHSIZE]={0}; 

    int source=0; 
    frontier[source]=true; 

    Node* Va; 
    cudaMalloc((void**)&Va,sizeof(Node)*GRAPHSIZE); 
    cudaMemcpy(Va,node,sizeof(Node)*GRAPHSIZE,cudaMemcpyHostToDevice); 

    int* Ea; 
    cudaMalloc((void**)&Ea,sizeof(Node)*GRAPHSIZE); 
    cudaMemcpy(Ea,edges,sizeof(Node)*GRAPHSIZE,cudaMemcpyHostToDevice); 

    bool* Fa; 
    cudaMalloc((void**)&Fa,sizeof(bool)*GRAPHSIZE); 
    cudaMemcpy(Fa,frontier,sizeof(bool)*GRAPHSIZE,cudaMemcpyHostToDevice); 

    bool* Xa; 
    cudaMalloc((void**)&Xa,sizeof(bool)*GRAPHSIZE); 
    cudaMemcpy(Xa,visited,sizeof(bool)*GRAPHSIZE,cudaMemcpyHostToDevice); 

    int* Ca; 
    cudaMalloc((void**)&Ca,sizeof(int)*GRAPHSIZE); 
    cudaMemcpy(Ca,custo,sizeof(int)*GRAPHSIZE,cudaMemcpyHostToDevice); 


    dim3 grid(100,100,1); //blocks per grid
    dim3 threads(100,1,1);  // threads per block




    bool para; 
    bool* parada; 
    cudaMalloc((void**)¶da,sizeof(bool)); 
    printf("_____________________________________________\n");
    int count=0;

    cudaEventRecord(start, 0);
    do{ 
        count ++;
        para=false; 
        cudaMemcpy(parada,¶,sizeof(bool),cudaMemcpyHostToDevice);       
        BFS <<<grid,threads,0>>>(Va,Ea,Fa,Xa,Ca,parada);    
        CUT_CHECK_ERROR("kernel1 execution failed"); 
        cudaMemcpy(¶,parada,sizeof(bool),cudaMemcpyDeviceToHost); 


    }while(para); 

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);


    //printf("\nFinal:\n");
    cudaMemcpy(custo,Ca,sizeof(int)*GRAPHSIZE,cudaMemcpyDeviceToHost); 
 /*
    printf("\n_____________________________________________\n");
    for(int i=0;i<GRAPHSIZE;i++) 
        printf("%d  ",custo[i]); 
    printf("\n");

    printf("_____________________________________________\n");
*/

    cudaEventElapsedTime(&time, start, stop);
    printf ("\nTime for the kernel: %lf s \n", time/1000);
    printf ("Number of kernel calls : %d \n", count);


    file = fopen ("graph125MPar","w");


    for(int i=0;i<GRAPHSIZE;i++) 
        fprintf(file,"%d    ",custo[i]); 
    fprintf(file,"\n");
    fclose(file);



}

I'm having a segmantation fault while trying to run it for 1M+ graphs (please note that I used the changed the stack size of the sistem with the command ' ulimit -s 16384 ' on linux)

Can someone help?

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1

标点 2024-12-21 06:32:45

不要对图表使用静态分配的主机数组,而应使用动态内存分配。您的 ulimit 命令将堆栈大小设置为 16384 kb,但您需要每个图形条目类似于 5*sizeof(int) + 2*sizeof(bool) 的内容,这可能是 22每个条目的字节数。很容易看出有 100 万个条目时堆栈空间将在哪里耗尽。

Don't use statically allocated host arrays for the graph, use dynamic memory allocation instead. Your ulimit command is setting the stacksize to 16384 kb, but you require something like 5*sizeof(int) + 2*sizeof(bool) per graph entry which is probably 22 bytes per entry. It is pretty easy to see where you will run out of stack space with 1 million entries.

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文