使用 MPI 发送向量时出现段错误

发布于 2025-01-17 06:01:13 字数 5338 浏览 1 评论 0原文

我有一个用一维向量表示的三维立方体,它沿垂直方向按面划分给各个 MPI 进程。为了进行计算,每个进程需要与相邻进程交换边界面(face+1 和 face-1),以便比较边界处的值。问题出现在发送和接收时:程序会失败。索引看起来是正确的,所以我找不到问题所在。

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <malloc.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include "aux.h"
/* Parenthesized so the macro expands as one value inside larger expressions
 * (e.g. `x / STABILITY` or `STABILITY * h`). */
#define STABILITY (1.0f / sqrt(3.0f))

void mdf_heat(double *u0,
              double *u1,
              double *aux,
              const unsigned int npX,
              const unsigned int npY,
              const unsigned int npZ,
              const double deltaH,
              const double deltaT,
              const double inErr,
              const double boundaries,
              const int me,
              const int np)
{
  double left, right, up, down, top, bottom;
  double alpha = deltaT / (deltaH * deltaH);
  MPI_Status status;
  int continued = 1;
  unsigned int steps = 0;

  while (continued)
  {
    steps++;
    if (me == 0)
    {
      MPI_Send(&u0[npX * npY * (npZ - 1)], npX * npY, MPI_DOUBLE, me + 1, 0, MPI_COMM_WORLD);
      // printf("(S:%d)(P:%d) Envio a %d\n", steps, me, me + 1);

      MPI_Recv(&aux[0], npX * npY, MPI_DOUBLE, me + 1, 0, MPI_COMM_WORLD, &status);
      printf("(S:%d)(P:%d) Recibo de %d\n", steps, me, me + 1);
    }
    else if (me == np - 1)
    {
      MPI_Send(&u0[0], npX * npY, MPI_DOUBLE, me - 1, 0, MPI_COMM_WORLD);
      // printf("(S:%d)(P:%d) Envio a %d\n", steps, me, me - 1);

      MPI_Recv(&aux[0], npX * npY, MPI_DOUBLE, me - 1, 0, MPI_COMM_WORLD, &status);
      printf("(S:%d)(P:%d) Recibo de %d\n", steps, me, me - 1);
    }
    else
    {
      MPI_Send(&u0[0], npX * npY, MPI_DOUBLE, me - 1, 0, MPI_COMM_WORLD);
      MPI_Send(&u0[npX * npY * (npZ - 1)], npX * npY, MPI_DOUBLE, me + 1, 0, MPI_COMM_WORLD);
      // printf("(S:%d)(P:%d) Envio a %d y %d\n", steps, me, me - 1, me + 1);

      MPI_Recv(&aux[0], npX * npY, MPI_DOUBLE, me - 1, 0, MPI_COMM_WORLD, &status);
      MPI_Recv(&aux[npX * npY], npX * npY, MPI_DOUBLE, me + 1, 0, MPI_COMM_WORLD, &status);
      printf("(S:%d)(P:%d) Recibo de %d y %d\n", steps, me, me - 1, me + 1);
    }

    ....

   }
   fprintf(stdout, "[%d] Done! in %u steps\n", me, steps);
}

int main(int ac, char **av)
{

  ....

  unsigned int npX = (unsigned int)(sizeX / deltaH);
  unsigned int npY = (unsigned int)(sizeY / deltaH);
  unsigned int npZ = (unsigned int)(sizeZ / deltaH) / np;

  u0_per_process = (double *)calloc(npZ * npY * npX, sizeof(double));
  u1_per_process = (double *)calloc(npZ * npY * npX, sizeof(double));

  if (me == 0 || me == np - 1)
    aux_per_process = (double *)calloc(npZ * npY * 1, sizeof(double));
  else
    aux_per_process = (double *)calloc(npZ * npY * 2, sizeof(double));

  printf("p(%d) (%u, %u, %u)\n", me, npX, npY, npZ);
  mdf_heat(u0_per_process, u1_per_process, aux_per_process, npX, npY, npZ, deltaH, deltaT, 1e-15, 100.0f, me, np);

  ....

}

控制台错误:

mpirun ./a.out 0.125
p(1) (8, 8, 2)
p(3) (8, 8, 2)
p(2) (8, 8, 2)
p(0) (8, 8, 2)
(S:1)(P:3) Recibo de 2
(S:1)(P:0) Recibo de 1
(S:1)(P:2) Recibo de 1 y 3
(S:1)(P:1) Recibo de 0 y 2
[mateev:113552] *** Process received signal ***
[mateev:113549] *** Process received signal ***
[mateev:113549] Signal: Segmentation fault (11)
[mateev:113549] Signal code: Address not mapped (1)
[mateev:113549] Failing at address: 0x48
[mateev:113550] *** Process received signal ***
[mateev:113550] Signal: Segmentation fault (11)
[mateev:113550] Signal code: Address not mapped (1)
[mateev:113550] Failing at address: 0x48
[mateev:113552] Signal: Segmentation fault (11)
[mateev:113552] Signal code: Address not mapped (1)
[mateev:113552] Failing at address: 0x48
a.out: malloc.c:4036: _int_malloc: Assertion `(unsigned long) (size) >= (unsigned long) (nb)' failed.
malloc(): invalid size (unsorted)
[mateev:113552] *** Process received signal ***
[mateev:113552] Signal: Aborted (6)
[mateev:113552] Signal code:  (-6)
[mateev:113550] *** Process received signal ***
[mateev:113550] Signal: Aborted (6)
[mateev:113550] Signal code:  (-6)
[mateev:113549] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x430c0)[0x7f8bb12180c0]
[mateev:113549] [ 1] /home/vladimir/.openmpi/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send+0x77)[0x7f8bb0177417]
[mateev:113549] [ 2] /home/vladimir/.openmpi/lib/libmpi.so.40(PMPI_Send+0x123)[0x7f8bb1488fb3]
[mateev:113549] [ 3] ./a.out(+0x1419)[0x55899d429419]
[mateev:113549] [ 4] ./a.out(+0x1965)[0x55899d429965]
[mateev:113549] [ 5] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0x7f8bb11f90b3]
[mateev:113549] [ 6] ./a.out(+0x128e)[0x55899d42928e]
[mateev:113549] *** End of error message ***
--------------------------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node mateev exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------

我尝试过调整索引:如果只发送 1 个 double,就不会出现段错误,但无法发送整个面。

I have a 3D cube stored in a flat vector and divided vertically into slabs of faces among the MPI processes. To do the calculation, each process needs to exchange its boundary faces (face+1 and face-1) with its neighbours so that the values at the edges can be compared. The problem appears when I do the sending and receiving, which fails. The indices look correct to me, so I cannot find the problem.

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <malloc.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include "aux.h"
/* Parenthesized so the macro expands as one value inside larger expressions
 * (e.g. `x / STABILITY` or `STABILITY * h`). */
#define STABILITY (1.0f / sqrt(3.0f))

void mdf_heat(double *u0,
              double *u1,
              double *aux,
              const unsigned int npX,
              const unsigned int npY,
              const unsigned int npZ,
              const double deltaH,
              const double deltaT,
              const double inErr,
              const double boundaries,
              const int me,
              const int np)
{
  double left, right, up, down, top, bottom;
  double alpha = deltaT / (deltaH * deltaH);
  MPI_Status status;
  int continued = 1;
  unsigned int steps = 0;

  while (continued)
  {
    steps++;
    if (me == 0)
    {
      MPI_Send(&u0[npX * npY * (npZ - 1)], npX * npY, MPI_DOUBLE, me + 1, 0, MPI_COMM_WORLD);
      // printf("(S:%d)(P:%d) Envio a %d\n", steps, me, me + 1);

      MPI_Recv(&aux[0], npX * npY, MPI_DOUBLE, me + 1, 0, MPI_COMM_WORLD, &status);
      printf("(S:%d)(P:%d) Recibo de %d\n", steps, me, me + 1);
    }
    else if (me == np - 1)
    {
      MPI_Send(&u0[0], npX * npY, MPI_DOUBLE, me - 1, 0, MPI_COMM_WORLD);
      // printf("(S:%d)(P:%d) Envio a %d\n", steps, me, me - 1);

      MPI_Recv(&aux[0], npX * npY, MPI_DOUBLE, me - 1, 0, MPI_COMM_WORLD, &status);
      printf("(S:%d)(P:%d) Recibo de %d\n", steps, me, me - 1);
    }
    else
    {
      MPI_Send(&u0[0], npX * npY, MPI_DOUBLE, me - 1, 0, MPI_COMM_WORLD);
      MPI_Send(&u0[npX * npY * (npZ - 1)], npX * npY, MPI_DOUBLE, me + 1, 0, MPI_COMM_WORLD);
      // printf("(S:%d)(P:%d) Envio a %d y %d\n", steps, me, me - 1, me + 1);

      MPI_Recv(&aux[0], npX * npY, MPI_DOUBLE, me - 1, 0, MPI_COMM_WORLD, &status);
      MPI_Recv(&aux[npX * npY], npX * npY, MPI_DOUBLE, me + 1, 0, MPI_COMM_WORLD, &status);
      printf("(S:%d)(P:%d) Recibo de %d y %d\n", steps, me, me - 1, me + 1);
    }

    ....

   }
   fprintf(stdout, "[%d] Done! in %u steps\n", me, steps);
}

int main(int ac, char **av)
{

  ....

  unsigned int npX = (unsigned int)(sizeX / deltaH);
  unsigned int npY = (unsigned int)(sizeY / deltaH);
  unsigned int npZ = (unsigned int)(sizeZ / deltaH) / np;

  u0_per_process = (double *)calloc(npZ * npY * npX, sizeof(double));
  u1_per_process = (double *)calloc(npZ * npY * npX, sizeof(double));

  if (me == 0 || me == np - 1)
    aux_per_process = (double *)calloc(npZ * npY * 1, sizeof(double));
  else
    aux_per_process = (double *)calloc(npZ * npY * 2, sizeof(double));

  printf("p(%d) (%u, %u, %u)\n", me, npX, npY, npZ);
  mdf_heat(u0_per_process, u1_per_process, aux_per_process, npX, npY, npZ, deltaH, deltaT, 1e-15, 100.0f, me, np);

  ....

}

Console error:

mpirun ./a.out 0.125
p(1) (8, 8, 2)
p(3) (8, 8, 2)
p(2) (8, 8, 2)
p(0) (8, 8, 2)
(S:1)(P:3) Recibo de 2
(S:1)(P:0) Recibo de 1
(S:1)(P:2) Recibo de 1 y 3
(S:1)(P:1) Recibo de 0 y 2
[mateev:113552] *** Process received signal ***
[mateev:113549] *** Process received signal ***
[mateev:113549] Signal: Segmentation fault (11)
[mateev:113549] Signal code: Address not mapped (1)
[mateev:113549] Failing at address: 0x48
[mateev:113550] *** Process received signal ***
[mateev:113550] Signal: Segmentation fault (11)
[mateev:113550] Signal code: Address not mapped (1)
[mateev:113550] Failing at address: 0x48
[mateev:113552] Signal: Segmentation fault (11)
[mateev:113552] Signal code: Address not mapped (1)
[mateev:113552] Failing at address: 0x48
a.out: malloc.c:4036: _int_malloc: Assertion `(unsigned long) (size) >= (unsigned long) (nb)' failed.
malloc(): invalid size (unsorted)
[mateev:113552] *** Process received signal ***
[mateev:113552] Signal: Aborted (6)
[mateev:113552] Signal code:  (-6)
[mateev:113550] *** Process received signal ***
[mateev:113550] Signal: Aborted (6)
[mateev:113550] Signal code:  (-6)
[mateev:113549] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x430c0)[0x7f8bb12180c0]
[mateev:113549] [ 1] /home/vladimir/.openmpi/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send+0x77)[0x7f8bb0177417]
[mateev:113549] [ 2] /home/vladimir/.openmpi/lib/libmpi.so.40(PMPI_Send+0x123)[0x7f8bb1488fb3]
[mateev:113549] [ 3] ./a.out(+0x1419)[0x55899d429419]
[mateev:113549] [ 4] ./a.out(+0x1965)[0x55899d429965]
[mateev:113549] [ 5] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0x7f8bb11f90b3]
[mateev:113549] [ 6] ./a.out(+0x128e)[0x55899d42928e]
[mateev:113549] *** End of error message ***
--------------------------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node mateev exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------

I have tried playing with the indices: if I send only 1 double, the segmentation fault does not occur.
But I cannot send the entire face.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。
列表为空,暂无数据
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文