CUDA BFS 巨图(seg.fault)
我正在 CUDA 上对 BFS 算法进行测试(我知道它存在一些同步问题,但无论如何测试它是我工作的一部分),但我在使用(或创建?)1M+ 大小的图表时遇到问题。
这是我用来创建它们的代码:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define GRAPHSIZE 1000000
struct Node
{
int begin; // comeco da sub-string de vizinhos
int num; // tamanho da sub-string de vizinhos
};
int getSize()
{
int size,arcs;
printf("Size of the graph: \nNodes:\n>>");
scanf ("%d", &size);
return size;
}
void createEdges(int graphSize, int* Edges)
{
int j,value, aux, rndIdx;
int edgesSize = 2*GRAPHSIZE;
srand(time(NULL));
printf ("\nGS : %d\n", graphSize);
j = 1;
for (int i=0; i < edgesSize; i++) //first it creates an ordered array of edges
{
if (j < GRAPHSIZE)
{
Edges [i] = j;
j++;
}
else
{
j=1;
Edges [i] = j;
j++;
}
}
for (int i=0; i < edgesSize; i++) //now, it randomly swaps the edges array
{
rndIdx = rand()%graphSize;
aux = Edges[rndIdx];
Edges[rndIdx] = Edges [i];
Edges [i] = aux;
}
}
int main ()
{
int size,graphAtts[2];
int edgesSize = 2*GRAPHSIZE;
int Edges[edgesSize];
struct Node node[GRAPHSIZE];
FILE *file;
printf("____________________________\nRandom graph generator in compact format, optmized for CUDA algorithms by Ianuarivs Severvs.\nFor details about this format read the README. \n");
//size = getSize(graphAtts);
//printf ("%d,%d",size,arcs);
createEdges(GRAPHSIZE,Edges); // or size?
/*
for (int i = 0; i < edgesSize ; i ++)
{
printf ("-- %d --", Edges[i]);
}
*/
printf("\nEdges:\n");
for (int i=0; i < edgesSize; i++)
printf("%d,",Edges[i]);
for (int i=0,j=0 ; i < GRAPHSIZE; i++,j+=2) // now, completes the graph
{
node[i].begin=j;
node[i].num=2;
printf ("\n node %d : begin = %d, num = 2",i,j);
}
printf("\n");
//writes file:
file = fopen ("graph1M.g","wb");
fwrite (&Edges, edgesSize * sizeof(int),1,file);
fwrite (&node, GRAPHSIZE * sizeof(struct Node),1,file);
fclose(file);
for (int i = 0; i < edgesSize ; i ++)
{
printf ("-- %d --", Edges[i]);
}
for (int i = 0; i < GRAPHSIZE ; i ++)
{
printf ("* %d *", i);
}
}
这是我的 BFS 代码(在 CUDA 上):
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <cutil.h>
#define GRAPHSIZE 1000000
struct Node
{
int begin; // begining of the substring
int num; // size of the sub-string
};
__global__ void BFS (Node *Va, int *Ea, bool *Fa, bool *Xa, int *Ca, bool *parada) // memory races on both Xa and Ca
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid > GRAPHSIZE)
*parada=true;
if (Fa[tid] == true && Xa[tid] == false)
{
Fa[tid] = false;
Xa[tid] = true;
//__syncthreads(); // this solves the memrace problem as long as the threads are all on the same block
for (int i = Va[tid].begin; i < (Va[tid].begin + Va[tid].num); i++) // Va begin is where it's edges' subarray begins, Va is it's number of elements
{
int nid = Ea[i];
if (Xa[nid] == false)
{
Ca[nid] = Ca[tid] + 1;
Fa[nid] = true;
*parada = true;
}
}
}
}
// The BFS frontier corresponds to all the nodes being processed at the current level.
int main()
{
// for the time couting:
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
FILE * file;
printf("\nLoading graph file...\n");
struct Node node[GRAPHSIZE];
int edgesSize = 2*GRAPHSIZE;
int edges[edgesSize];
file = fopen ("graph1M.g","rb");
printf("abriu");
fread (&edges, edgesSize * sizeof(int),1,file);
fread (&node, GRAPHSIZE * sizeof(struct Node),1,file);
fclose(file);
//For file read test propouses only:
/*
for (int i = 0; i < edgesSize ; i ++)
{
printf ("-- %d --", edges[i]);
}
for (int i = 0; i < GRAPHSIZE ; i ++)
{
printf ("* %d *", i);
}
*/
bool frontier[GRAPHSIZE]={false};
bool visited[GRAPHSIZE]={false};
int custo[GRAPHSIZE]={0};
int source=0;
frontier[source]=true;
Node* Va;
cudaMalloc((void**)&Va,sizeof(Node)*GRAPHSIZE);
cudaMemcpy(Va,node,sizeof(Node)*GRAPHSIZE,cudaMemcpyHostToDevice);
int* Ea;
cudaMalloc((void**)&Ea,sizeof(Node)*GRAPHSIZE);
cudaMemcpy(Ea,edges,sizeof(Node)*GRAPHSIZE,cudaMemcpyHostToDevice);
bool* Fa;
cudaMalloc((void**)&Fa,sizeof(bool)*GRAPHSIZE);
cudaMemcpy(Fa,frontier,sizeof(bool)*GRAPHSIZE,cudaMemcpyHostToDevice);
bool* Xa;
cudaMalloc((void**)&Xa,sizeof(bool)*GRAPHSIZE);
cudaMemcpy(Xa,visited,sizeof(bool)*GRAPHSIZE,cudaMemcpyHostToDevice);
int* Ca;
cudaMalloc((void**)&Ca,sizeof(int)*GRAPHSIZE);
cudaMemcpy(Ca,custo,sizeof(int)*GRAPHSIZE,cudaMemcpyHostToDevice);
dim3 grid(100,100,1); //blocks per grid
dim3 threads(100,1,1); // threads per block
bool para;
bool* parada;
cudaMalloc((void**)¶da,sizeof(bool));
printf("_____________________________________________\n");
int count=0;
cudaEventRecord(start, 0);
do{
count ++;
para=false;
cudaMemcpy(parada,¶,sizeof(bool),cudaMemcpyHostToDevice);
BFS <<<grid,threads,0>>>(Va,Ea,Fa,Xa,Ca,parada);
CUT_CHECK_ERROR("kernel1 execution failed");
cudaMemcpy(¶,parada,sizeof(bool),cudaMemcpyDeviceToHost);
}while(para);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
//printf("\nFinal:\n");
cudaMemcpy(custo,Ca,sizeof(int)*GRAPHSIZE,cudaMemcpyDeviceToHost);
/*
printf("\n_____________________________________________\n");
for(int i=0;i<GRAPHSIZE;i++)
printf("%d ",custo[i]);
printf("\n");
printf("_____________________________________________\n");
*/
cudaEventElapsedTime(&time, start, stop);
printf ("\nTime for the kernel: %lf s \n", time/1000);
printf ("Number of kernel calls : %d \n", count);
file = fopen ("graph125MPar","w");
for(int i=0;i<GRAPHSIZE;i++)
fprintf(file,"%d ",custo[i]);
fprintf(file,"\n");
fclose(file);
}
我在尝试为 1M+ 图形运行它时遇到分段错误(请注意,我使用了更改后的系统堆栈大小)在 Linux 上命令“ulimit -s 16384”)
有人可以帮忙吗?
I'm making a test on a BFS algorithm on CUDA ( wich I know that has some syncronization problems, but it's part of my work to test it anyway ) but i'm having problems on using (or creating?) 1M+ size graphs.
Here's the code I use to create them:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define GRAPHSIZE 1000000
struct Node
{
int begin; // comeco da sub-string de vizinhos
int num; // tamanho da sub-string de vizinhos
};
int getSize()
{
int size,arcs;
printf("Size of the graph: \nNodes:\n>>");
scanf ("%d", &size);
return size;
}
void createEdges(int graphSize, int* Edges)
{
int j,value, aux, rndIdx;
int edgesSize = 2*GRAPHSIZE;
srand(time(NULL));
printf ("\nGS : %d\n", graphSize);
j = 1;
for (int i=0; i < edgesSize; i++) //first it creates an ordered array of edges
{
if (j < GRAPHSIZE)
{
Edges [i] = j;
j++;
}
else
{
j=1;
Edges [i] = j;
j++;
}
}
for (int i=0; i < edgesSize; i++) //now, it randomly swaps the edges array
{
rndIdx = rand()%graphSize;
aux = Edges[rndIdx];
Edges[rndIdx] = Edges [i];
Edges [i] = aux;
}
}
int main ()
{
int size,graphAtts[2];
int edgesSize = 2*GRAPHSIZE;
int Edges[edgesSize];
struct Node node[GRAPHSIZE];
FILE *file;
printf("____________________________\nRandom graph generator in compact format, optmized for CUDA algorithms by Ianuarivs Severvs.\nFor details about this format read the README. \n");
//size = getSize(graphAtts);
//printf ("%d,%d",size,arcs);
createEdges(GRAPHSIZE,Edges); // or size?
/*
for (int i = 0; i < edgesSize ; i ++)
{
printf ("-- %d --", Edges[i]);
}
*/
printf("\nEdges:\n");
for (int i=0; i < edgesSize; i++)
printf("%d,",Edges[i]);
for (int i=0,j=0 ; i < GRAPHSIZE; i++,j+=2) // now, completes the graph
{
node[i].begin=j;
node[i].num=2;
printf ("\n node %d : begin = %d, num = 2",i,j);
}
printf("\n");
//writes file:
file = fopen ("graph1M.g","wb");
fwrite (&Edges, edgesSize * sizeof(int),1,file);
fwrite (&node, GRAPHSIZE * sizeof(struct Node),1,file);
fclose(file);
for (int i = 0; i < edgesSize ; i ++)
{
printf ("-- %d --", Edges[i]);
}
for (int i = 0; i < GRAPHSIZE ; i ++)
{
printf ("* %d *", i);
}
}
And here's my BFS code (on CUDA):
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <cutil.h>
#define GRAPHSIZE 1000000
struct Node
{
int begin; // begining of the substring
int num; // size of the sub-string
};
__global__ void BFS (Node *Va, int *Ea, bool *Fa, bool *Xa, int *Ca, bool *parada) // memory races on both Xa and Ca
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid > GRAPHSIZE)
*parada=true;
if (Fa[tid] == true && Xa[tid] == false)
{
Fa[tid] = false;
Xa[tid] = true;
//__syncthreads(); // this solves the memrace problem as long as the threads are all on the same block
for (int i = Va[tid].begin; i < (Va[tid].begin + Va[tid].num); i++) // Va begin is where it's edges' subarray begins, Va is it's number of elements
{
int nid = Ea[i];
if (Xa[nid] == false)
{
Ca[nid] = Ca[tid] + 1;
Fa[nid] = true;
*parada = true;
}
}
}
}
// The BFS frontier corresponds to all the nodes being processed at the current level.
int main()
{
// for the time couting:
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
FILE * file;
printf("\nLoading graph file...\n");
struct Node node[GRAPHSIZE];
int edgesSize = 2*GRAPHSIZE;
int edges[edgesSize];
file = fopen ("graph1M.g","rb");
printf("abriu");
fread (&edges, edgesSize * sizeof(int),1,file);
fread (&node, GRAPHSIZE * sizeof(struct Node),1,file);
fclose(file);
//For file read test propouses only:
/*
for (int i = 0; i < edgesSize ; i ++)
{
printf ("-- %d --", edges[i]);
}
for (int i = 0; i < GRAPHSIZE ; i ++)
{
printf ("* %d *", i);
}
*/
bool frontier[GRAPHSIZE]={false};
bool visited[GRAPHSIZE]={false};
int custo[GRAPHSIZE]={0};
int source=0;
frontier[source]=true;
Node* Va;
cudaMalloc((void**)&Va,sizeof(Node)*GRAPHSIZE);
cudaMemcpy(Va,node,sizeof(Node)*GRAPHSIZE,cudaMemcpyHostToDevice);
int* Ea;
cudaMalloc((void**)&Ea,sizeof(Node)*GRAPHSIZE);
cudaMemcpy(Ea,edges,sizeof(Node)*GRAPHSIZE,cudaMemcpyHostToDevice);
bool* Fa;
cudaMalloc((void**)&Fa,sizeof(bool)*GRAPHSIZE);
cudaMemcpy(Fa,frontier,sizeof(bool)*GRAPHSIZE,cudaMemcpyHostToDevice);
bool* Xa;
cudaMalloc((void**)&Xa,sizeof(bool)*GRAPHSIZE);
cudaMemcpy(Xa,visited,sizeof(bool)*GRAPHSIZE,cudaMemcpyHostToDevice);
int* Ca;
cudaMalloc((void**)&Ca,sizeof(int)*GRAPHSIZE);
cudaMemcpy(Ca,custo,sizeof(int)*GRAPHSIZE,cudaMemcpyHostToDevice);
dim3 grid(100,100,1); //blocks per grid
dim3 threads(100,1,1); // threads per block
bool para;
bool* parada;
cudaMalloc((void**)¶da,sizeof(bool));
printf("_____________________________________________\n");
int count=0;
cudaEventRecord(start, 0);
do{
count ++;
para=false;
cudaMemcpy(parada,¶,sizeof(bool),cudaMemcpyHostToDevice);
BFS <<<grid,threads,0>>>(Va,Ea,Fa,Xa,Ca,parada);
CUT_CHECK_ERROR("kernel1 execution failed");
cudaMemcpy(¶,parada,sizeof(bool),cudaMemcpyDeviceToHost);
}while(para);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
//printf("\nFinal:\n");
cudaMemcpy(custo,Ca,sizeof(int)*GRAPHSIZE,cudaMemcpyDeviceToHost);
/*
printf("\n_____________________________________________\n");
for(int i=0;i<GRAPHSIZE;i++)
printf("%d ",custo[i]);
printf("\n");
printf("_____________________________________________\n");
*/
cudaEventElapsedTime(&time, start, stop);
printf ("\nTime for the kernel: %lf s \n", time/1000);
printf ("Number of kernel calls : %d \n", count);
file = fopen ("graph125MPar","w");
for(int i=0;i<GRAPHSIZE;i++)
fprintf(file,"%d ",custo[i]);
fprintf(file,"\n");
fclose(file);
}
I'm having a segmantation fault while trying to run it for 1M+ graphs (please note that I used the changed the stack size of the sistem with the command ' ulimit -s 16384 ' on linux)
Can someone help?
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
不要对图表使用静态分配的主机数组,而应使用动态内存分配。您的 ulimit 命令将堆栈大小设置为 16384 kb,但您需要每个图形条目类似于 5*sizeof(int) + 2*sizeof(bool) 的内容,这可能是 22每个条目的字节数。很容易看出有 100 万个条目时堆栈空间将在哪里耗尽。
Don't use statically allocated host arrays for the graph, use dynamic memory allocation instead. Your
ulimit
command is setting the stacksize to 16384 kb, but you require something like5*sizeof(int) + 2*sizeof(bool)
per graph entry which is probably 22 bytes per entry. It is pretty easy to see where you will run out of stack space with 1 million entries.