在 MPI 上发送向量的分段错误
我有一个用向量表示的 3D 立方体,它沿垂直方向按面划分给各个 MPI 进程。为了进行计算,每个进程需要把边界面传给相邻进程(face+1 和 face-1),以便比较边界处的值。问题出在发送和接收这一步:程序在这里失败。索引看起来是正确的,我找不到问题所在。
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <malloc.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include "aux.h"
/* Constant 1/sqrt(3). The expansion is parenthesized so it stays a single
 * operand inside larger expressions (e.g. `x / STABILITY`). */
#define STABILITY (1.0f / sqrt(3.0f))
/**
 * Runs the time-stepping loop on this rank's slab and, at the start of every
 * step, exchanges boundary faces of u0 with the neighbouring ranks.
 *
 * One face holds npX * npY doubles. Halo layout written into aux:
 *   - rank 0:       aux[0 .. face)       <- face received from rank 1
 *   - rank np - 1:  aux[0 .. face)       <- face received from rank np - 2
 *   - other ranks:  aux[0 .. face)       <- face received from rank me - 1
 *                   aux[face .. 2*face)  <- face received from rank me + 1
 *
 * aux must therefore provide room for npX * npY doubles on the two edge
 * ranks and 2 * npX * npY doubles on every interior rank; a smaller buffer
 * is overrun by the receive.
 *
 * @param u0         local slab (npX * npY * npZ doubles); its first and last
 *                   faces are the ones sent to the neighbours
 * @param u1         second slab buffer (not touched by the exchange)
 * @param aux        halo buffer, see layout above
 * @param npX        points per row of a face
 * @param npY        rows per face
 * @param npZ        number of faces owned by this rank (must be >= 1)
 * @param deltaH     grid spacing, used for alpha = deltaT / deltaH^2
 * @param deltaT     time step, used for alpha = deltaT / deltaH^2
 * @param inErr      not used by the exchange shown here
 * @param boundaries not used by the exchange shown here
 * @param me         rank of the calling process
 * @param np         number of processes
 */
void mdf_heat(double *u0,
              double *u1,
              double *aux,
              const unsigned int npX,
              const unsigned int npY,
              const unsigned int npZ,
              const double deltaH,
              const double deltaT,
              const double inErr,
              const double boundaries,
              const int me,
              const int np)
{
    double left, right, up, down, top, bottom;
    double alpha = deltaT / (deltaH * deltaH);
    MPI_Status status;
    int continued = 1;
    unsigned int steps = 0;

    /* Compute offsets in size_t so large grids do not wrap in unsigned int. */
    const size_t face = (size_t)npX * npY;
    const int count = (int)face; /* MPI element counts are int */
    double *first_face = &u0[0];
    double *last_face = &u0[face * (npZ - 1)];

    while (continued)
    {
        steps++;

        /*
         * MPI_Sendrecv is used instead of MPI_Send followed by MPI_Recv:
         * when every rank sends first, the exchange only completes if the
         * library buffers the message, and it can deadlock otherwise.
         */
        if (np < 2)
        {
            /* Single process: there is no neighbour to exchange with. */
        }
        else if (me == 0)
        {
            MPI_Sendrecv(last_face, count, MPI_DOUBLE, me + 1, 0,
                         &aux[0], count, MPI_DOUBLE, me + 1, 0,
                         MPI_COMM_WORLD, &status);
            printf("(S:%u)(P:%d) Recibo de %d\n", steps, me, me + 1);
        }
        else if (me == np - 1)
        {
            MPI_Sendrecv(first_face, count, MPI_DOUBLE, me - 1, 0,
                         &aux[0], count, MPI_DOUBLE, me - 1, 0,
                         MPI_COMM_WORLD, &status);
            printf("(S:%u)(P:%d) Recibo de %d\n", steps, me, me - 1);
        }
        else
        {
            /* Lower neighbour first, then upper neighbour. */
            MPI_Sendrecv(first_face, count, MPI_DOUBLE, me - 1, 0,
                         &aux[0], count, MPI_DOUBLE, me - 1, 0,
                         MPI_COMM_WORLD, &status);
            MPI_Sendrecv(last_face, count, MPI_DOUBLE, me + 1, 0,
                         &aux[face], count, MPI_DOUBLE, me + 1, 0,
                         MPI_COMM_WORLD, &status);
            printf("(S:%u)(P:%d) Recibo de %d y %d\n", steps, me, me - 1, me + 1);
        }
....
    }
    fprintf(stdout, "[%d] Done! in %u steps\n", me, steps);
}
int main(int ac, char **av)
{
....
unsigned int npX = (unsigned int)(sizeX / deltaH);
unsigned int npY = (unsigned int)(sizeY / deltaH);
unsigned int npZ = (unsigned int)(sizeZ / deltaH) / np;
u0_per_process = (double *)calloc(npZ * npY * npX, sizeof(double));
u1_per_process = (double *)calloc(npZ * npY * npX, sizeof(double));
if (me == 0 || me == np - 1)
aux_per_process = (double *)calloc(npZ * npY * 1, sizeof(double));
else
aux_per_process = (double *)calloc(npZ * npY * 2, sizeof(double));
printf("p(%d) (%u, %u, %u)\n", me, npX, npY, npZ);
mdf_heat(u0_per_process, u1_per_process, aux_per_process, npX, npY, npZ, deltaH, deltaT, 1e-15, 100.0f, me, np);
....
}
控制台错误:
mpirun ./a.out 0.125
p(1) (8, 8, 2)
p(3) (8, 8, 2)
p(2) (8, 8, 2)
p(0) (8, 8, 2)
(S:1)(P:3) Recibo de 2
(S:1)(P:0) Recibo de 1
(S:1)(P:2) Recibo de 1 y 3
(S:1)(P:1) Recibo de 0 y 2
[mateev:113552] *** Process received signal ***
[mateev:113549] *** Process received signal ***
[mateev:113549] Signal: Segmentation fault (11)
[mateev:113549] Signal code: Address not mapped (1)
[mateev:113549] Failing at address: 0x48
[mateev:113550] *** Process received signal ***
[mateev:113550] Signal: Segmentation fault (11)
[mateev:113550] Signal code: Address not mapped (1)
[mateev:113550] Failing at address: 0x48
[mateev:113552] Signal: Segmentation fault (11)
[mateev:113552] Signal code: Address not mapped (1)
[mateev:113552] Failing at address: 0x48
a.out: malloc.c:4036: _int_malloc: Assertion `(unsigned long) (size) >= (unsigned long) (nb)' failed.
malloc(): invalid size (unsorted)
[mateev:113552] *** Process received signal ***
[mateev:113552] Signal: Aborted (6)
[mateev:113552] Signal code: (-6)
[mateev:113550] *** Process received signal ***
[mateev:113550] Signal: Aborted (6)
[mateev:113550] Signal code: (-6)
[mateev:113549] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x430c0)[0x7f8bb12180c0]
[mateev:113549] [ 1] /home/vladimir/.openmpi/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send+0x77)[0x7f8bb0177417]
[mateev:113549] [ 2] /home/vladimir/.openmpi/lib/libmpi.so.40(PMPI_Send+0x123)[0x7f8bb1488fb3]
[mateev:113549] [ 3] ./a.out(+0x1419)[0x55899d429419]
[mateev:113549] [ 4] ./a.out(+0x1965)[0x55899d429965]
[mateev:113549] [ 5] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0x7f8bb11f90b3]
[mateev:113549] [ 6] ./a.out(+0x128e)[0x55899d42928e]
[mateev:113549] *** End of error message ***
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node mateev exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------
我尝试过调整索引:如果只发送 1 个 double,就不会出现段错误,但无法发送整个面。
I have a 3D cube stored in a flat vector and divided vertically, face by face, among the MPI processes. To do the calculation, each process needs to pass faces to its neighbours (face+1 and face-1) so that the boundary values can be compared. The problem occurs during the send and receive, which fails. The indices look correct to me, so I cannot find the problem.
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <malloc.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include "aux.h"
/* Constant 1/sqrt(3). The expansion is parenthesized so it stays a single
 * operand inside larger expressions (e.g. `x / STABILITY`). */
#define STABILITY (1.0f / sqrt(3.0f))
/**
 * Runs the time-stepping loop on this rank's slab and, at the start of every
 * step, exchanges boundary faces of u0 with the neighbouring ranks.
 *
 * One face holds npX * npY doubles. Halo layout written into aux:
 *   - rank 0:       aux[0 .. face)       <- face received from rank 1
 *   - rank np - 1:  aux[0 .. face)       <- face received from rank np - 2
 *   - other ranks:  aux[0 .. face)       <- face received from rank me - 1
 *                   aux[face .. 2*face)  <- face received from rank me + 1
 *
 * aux must therefore provide room for npX * npY doubles on the two edge
 * ranks and 2 * npX * npY doubles on every interior rank; a smaller buffer
 * is overrun by the receive.
 *
 * @param u0         local slab (npX * npY * npZ doubles); its first and last
 *                   faces are the ones sent to the neighbours
 * @param u1         second slab buffer (not touched by the exchange)
 * @param aux        halo buffer, see layout above
 * @param npX        points per row of a face
 * @param npY        rows per face
 * @param npZ        number of faces owned by this rank (must be >= 1)
 * @param deltaH     grid spacing, used for alpha = deltaT / deltaH^2
 * @param deltaT     time step, used for alpha = deltaT / deltaH^2
 * @param inErr      not used by the exchange shown here
 * @param boundaries not used by the exchange shown here
 * @param me         rank of the calling process
 * @param np         number of processes
 */
void mdf_heat(double *u0,
              double *u1,
              double *aux,
              const unsigned int npX,
              const unsigned int npY,
              const unsigned int npZ,
              const double deltaH,
              const double deltaT,
              const double inErr,
              const double boundaries,
              const int me,
              const int np)
{
    double left, right, up, down, top, bottom;
    double alpha = deltaT / (deltaH * deltaH);
    MPI_Status status;
    int continued = 1;
    unsigned int steps = 0;

    /* Compute offsets in size_t so large grids do not wrap in unsigned int. */
    const size_t face = (size_t)npX * npY;
    const int count = (int)face; /* MPI element counts are int */
    double *first_face = &u0[0];
    double *last_face = &u0[face * (npZ - 1)];

    while (continued)
    {
        steps++;

        /*
         * MPI_Sendrecv is used instead of MPI_Send followed by MPI_Recv:
         * when every rank sends first, the exchange only completes if the
         * library buffers the message, and it can deadlock otherwise.
         */
        if (np < 2)
        {
            /* Single process: there is no neighbour to exchange with. */
        }
        else if (me == 0)
        {
            MPI_Sendrecv(last_face, count, MPI_DOUBLE, me + 1, 0,
                         &aux[0], count, MPI_DOUBLE, me + 1, 0,
                         MPI_COMM_WORLD, &status);
            printf("(S:%u)(P:%d) Recibo de %d\n", steps, me, me + 1);
        }
        else if (me == np - 1)
        {
            MPI_Sendrecv(first_face, count, MPI_DOUBLE, me - 1, 0,
                         &aux[0], count, MPI_DOUBLE, me - 1, 0,
                         MPI_COMM_WORLD, &status);
            printf("(S:%u)(P:%d) Recibo de %d\n", steps, me, me - 1);
        }
        else
        {
            /* Lower neighbour first, then upper neighbour. */
            MPI_Sendrecv(first_face, count, MPI_DOUBLE, me - 1, 0,
                         &aux[0], count, MPI_DOUBLE, me - 1, 0,
                         MPI_COMM_WORLD, &status);
            MPI_Sendrecv(last_face, count, MPI_DOUBLE, me + 1, 0,
                         &aux[face], count, MPI_DOUBLE, me + 1, 0,
                         MPI_COMM_WORLD, &status);
            printf("(S:%u)(P:%d) Recibo de %d y %d\n", steps, me, me - 1, me + 1);
        }
....
    }
    fprintf(stdout, "[%d] Done! in %u steps\n", me, steps);
}
int main(int ac, char **av)
{
....
unsigned int npX = (unsigned int)(sizeX / deltaH);
unsigned int npY = (unsigned int)(sizeY / deltaH);
unsigned int npZ = (unsigned int)(sizeZ / deltaH) / np;
u0_per_process = (double *)calloc(npZ * npY * npX, sizeof(double));
u1_per_process = (double *)calloc(npZ * npY * npX, sizeof(double));
if (me == 0 || me == np - 1)
aux_per_process = (double *)calloc(npZ * npY * 1, sizeof(double));
else
aux_per_process = (double *)calloc(npZ * npY * 2, sizeof(double));
printf("p(%d) (%u, %u, %u)\n", me, npX, npY, npZ);
mdf_heat(u0_per_process, u1_per_process, aux_per_process, npX, npY, npZ, deltaH, deltaT, 1e-15, 100.0f, me, np);
....
}
Console error:
mpirun ./a.out 0.125
p(1) (8, 8, 2)
p(3) (8, 8, 2)
p(2) (8, 8, 2)
p(0) (8, 8, 2)
(S:1)(P:3) Recibo de 2
(S:1)(P:0) Recibo de 1
(S:1)(P:2) Recibo de 1 y 3
(S:1)(P:1) Recibo de 0 y 2
[mateev:113552] *** Process received signal ***
[mateev:113549] *** Process received signal ***
[mateev:113549] Signal: Segmentation fault (11)
[mateev:113549] Signal code: Address not mapped (1)
[mateev:113549] Failing at address: 0x48
[mateev:113550] *** Process received signal ***
[mateev:113550] Signal: Segmentation fault (11)
[mateev:113550] Signal code: Address not mapped (1)
[mateev:113550] Failing at address: 0x48
[mateev:113552] Signal: Segmentation fault (11)
[mateev:113552] Signal code: Address not mapped (1)
[mateev:113552] Failing at address: 0x48
a.out: malloc.c:4036: _int_malloc: Assertion `(unsigned long) (size) >= (unsigned long) (nb)' failed.
malloc(): invalid size (unsorted)
[mateev:113552] *** Process received signal ***
[mateev:113552] Signal: Aborted (6)
[mateev:113552] Signal code: (-6)
[mateev:113550] *** Process received signal ***
[mateev:113550] Signal: Aborted (6)
[mateev:113550] Signal code: (-6)
[mateev:113549] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x430c0)[0x7f8bb12180c0]
[mateev:113549] [ 1] /home/vladimir/.openmpi/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send+0x77)[0x7f8bb0177417]
[mateev:113549] [ 2] /home/vladimir/.openmpi/lib/libmpi.so.40(PMPI_Send+0x123)[0x7f8bb1488fb3]
[mateev:113549] [ 3] ./a.out(+0x1419)[0x55899d429419]
[mateev:113549] [ 4] ./a.out(+0x1965)[0x55899d429965]
[mateev:113549] [ 5] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0x7f8bb11f90b3]
[mateev:113549] [ 6] ./a.out(+0x128e)[0x55899d42928e]
[mateev:113549] *** End of error message ***
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node mateev exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------
I have tried playing with the indices: if I send only 1 double, the segmentation fault does not occur.
But I cannot send the entire face.
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论