MPI_Scatterv in Fortran crashes with an abort trap signal

I have written a master-slave I/O routine in Fortran. First, process 0 reads the file and puts the data into the array read_buffer; then I call the subroutine "scatter_data".
I created several communicators to scatter the data from process 0 to processes 0-8.
The communicators look like this:

2, 5, 8  : sub_io_communicator (in this sub_io_communicator, sub_iorank is /0, 1, 2/)
1, 4, 7  : sub_io_communicator (in this sub_io_communicator, sub_iorank is /0, 1, 2/)
0, 3, 6  : sub_io_communicator (in this sub_io_communicator, sub_iorank is /0, 1, 2/)
2, 1, 0  : master_communicator

                      DATA
                       ||
                 0 process read
                       ||
                        0           call MPI_scatterv in communicator "master_communicator"    
                /       |      \    
               0        1        2  call MPI_scatterv in communicator "sub_io_communicator" 
             / | \    / | \    / | \
            0  3  6  1  4  7  2  5  8  
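
For context, the communicator-creation code is not shown here; a minimal sketch of one way to build this layout with MPI_Comm_split (this is an assumption about the setup, not the actual code; variable names follow the subroutine below) would be:

! sketch: how the communicators above could be created (assumed, not from the post)
program make_comms
  implicit none
  include 'mpif.h'
  integer :: ierr, my_rank, color
  integer :: sub_io_communicator, sub_iorank
  integer :: master_communicator, master_iorank

  call MPI_Init(ierr)
  call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr)

  ! ranks {0,3,6}, {1,4,7}, {2,5,8} share a color; key my_rank/3 makes sub_iorank run 0,1,2
  call MPI_Comm_split(MPI_COMM_WORLD, mod(my_rank,3), my_rank/3, sub_io_communicator, ierr)
  call MPI_Comm_rank(sub_io_communicator, sub_iorank, ierr)

  ! only the sub_iorank==0 processes (world ranks 0,1,2) join master_communicator
  color = MPI_UNDEFINED
  if (sub_iorank == 0) color = 0
  call MPI_Comm_split(MPI_COMM_WORLD, color, my_rank, master_communicator, ierr)
  if (sub_iorank == 0) call MPI_Comm_rank(master_communicator, master_iorank, ierr)

  call MPI_Finalize(ierr)
end program make_comms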

But when I call MPI_Scatterv, it crashes. I used "print" statements to debug it and found that the failure is inside the call to MPI_Scatterv. So I wrote a very simple MPI_Scatterv into this subroutine to see whether that would work, but it does not; a standalone version of that sanity test is sketched below.
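
For reference, a self-contained version of that simple test might look like the following sketch (program name and send values are hypothetical; note that it uses MPI_INTEGER, the Fortran integer datatype, whereas the subroutine below passes MPI_INT, the C-side name):

! sketch of the sanity test as a standalone program; run with: mpirun -np 3 ./a.out
program scatterv_test
  implicit none
  include 'mpif.h'
  integer :: ierr, rank
  integer :: sendbuf(3), recvval
  integer :: counts(3), displs(3)

  call MPI_Init(ierr)
  call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)

  sendbuf = (/10, 20, 30/)   ! significant only at the root
  counts  = (/1, 1, 1/)      ! one integer per rank, like tmp1 below
  displs  = (/0, 1, 2/)      ! like tmp2 below
  call MPI_Scatterv(sendbuf, counts, displs, MPI_INTEGER, &
                    recvval, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
  print *, rank, recvval

  call MPI_Finalize(ierr)
end program scatterv_test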

My code is like this:

SUBROUTINE scatter_data(read_buffer,ne_in)

use naqpms_nest, only : nest, nxlo, nylo, ratio, nx, ny
implicit none
include 'mpif.h'
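! NOTE: sub_iorank, master_iorank, my_rank, dims, sx, ex, sy, ey, bdy_gather,
! receive_buffer and the two communicators are not declared here, so they
! presumably come from the enclosing module (not shown in this excerpt).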

integer, INTENT(IN) :: ne_in
integer :: ierr
integer :: location
integer :: ii, jj, kk, zz, dd, send_size,receive_size
real, INTENT(IN), dimension(nx(ne_in)*ny(ne_in)) :: read_buffer
integer, dimension(nx(ne_in)*ny(ne_in)) :: read_buffer_int
real,    allocatable :: rerange_buffer(:)
integer, allocatable :: receive_buffer_int(:)
integer, allocatable :: counts_recv(:), displacements(:)
integer              :: distance, left_bdy, left_bdy2, rigth_bdy, rigth_bdy2
integer :: tmp1(3), tmp2(3)

IF( sub_iorank.EQ.0 ) THEN
    if(allocated(counts_recv))deallocate(counts_recv)
    if(allocated(displacements))deallocate(displacements)
    if(allocated(receive_buffer))deallocate(receive_buffer)
    if(allocated(rerange_buffer))deallocate(rerange_buffer)

    IF( master_iorank.EQ.0) THEN
        allocate(counts_recv(dims(2,ne_in)))
        allocate(displacements(dims(2,ne_in)))
    ENDIF

    print*,"location",my_rank,nx(ne_in), ey(ne_in), sy(ne_in)

    receive_size = nx(ne_in) * (ey(ne_in)-sy(ne_in)+1)
    allocate(receive_buffer(receive_size))
    allocate(receive_buffer_int(receive_size))
    allocate(rerange_buffer(receive_size))

    CALL MPI_Gather(receive_size, 1, MPI_INTEGER, counts_recv, 1, MPI_INTEGER,&
                     0, master_communicator ,ierr )


    tmp1(1:3)= (/1,1,1/)
    tmp2(1:3)= (/0,1,2/)
    IF(my_rank==0)print*,"counts_recv",counts_recv        
    CALL mpi_scatterv (counts_recv, tmp1, tmp2, MPI_INT,&                       ! this is just a test: one integer per master rank
                       receive_size, 1, mpi_int, 0, master_communicator, ierr )
    print*,my_rank,receive_size

    IF(my_rank.EQ.0) THEN
        displacements(1)=0
        do ii=2, dims(2,ne_in)
            displacements(ii) = displacements(ii-1) + counts_recv(ii-1)
        enddo
    ENDIF
    IF(my_rank==0)print*,displacements,counts_recv
    CALL mpi_scatterv (read_buffer, counts_recv, displacements, mpi_real,&
                       receive_buffer, receive_size, mpi_real, 0, master_communicator, ierr )

    IF(my_rank==0)print*,"mpi_scatterv one ok",my_rank
ENDIF ! sub_iorank == 0



IF(sub_iorank .EQ. 0) THEN
    if(allocated(counts_recv))deallocate(counts_recv)
    if(allocated(displacements))deallocate(displacements)
    allocate(counts_recv(dims(1,ne_in)))
    allocate(displacements(dims(1,ne_in)))
ENDIF

receive_size =(ex(ne_in)-sx(ne_in)+1)*(ey(ne_in)-sy(ne_in)+1)
CALL MPI_Gather(receive_size, 1, MPI_INTEGER, counts_recv, 1, MPI_INTEGER, 0, sub_io_communicator ,ierr )

IF(sub_iorank .EQ. 0) THEN
    displacements(1)=0                
    do ii=2,dims(1,ne_in)                 
        displacements(ii) = displacements(ii-1) + counts_recv(ii-1)
    enddo
ENDIF

IF(sub_iorank .EQ. 0) THEN
    DO dd = 1, dims(1,ne_in)
        DO jj = 1, bdy_gather(4,dd,ne_in)-bdy_gather(3,dd,ne_in)+1
            distance = bdy_gather(2,dd,ne_in)-bdy_gather(1,dd,ne_in)+1
            left_bdy = (jj-1)*nx(ne_in) + bdy_gather(1,dd,ne_in)
            rigth_bdy = (jj-1)*nx(ne_in) + bdy_gather(2,dd,ne_in)
            left_bdy2 = displacements(dd) + (jj-1)*distance + 1  
            rigth_bdy2 = displacements(dd) + (jj-1)*distance + distance
            rerange_buffer( left_bdy2 : rigth_bdy2) = receive_buffer(left_bdy : rigth_bdy )

        ENDDO
    ENDDO
ENDIF

if(allocated(receive_buffer))deallocate(receive_buffer)
allocate(receive_buffer(receive_size))
IF(sub_iorank .EQ. 0) print*, my_rank, counts_recv, displacements
CALL mpi_scatterv( rerange_buffer, counts_recv, displacements, mpi_real,&
                   receive_buffer, receive_size, mpi_real, 0, sub_io_communicator, ierr)
IF(sub_iorank .EQ. 0) print*,"mpi_scatterv ok"

END SUBROUTINE scatter_data

I run the code like this: mpirun -np 9 ./gnaqpms.v1.6.0_jx0307.exe

The error in the log file then looks like this:

 location           0          88          26           1
 location           1          88          52          27
 location           2          88          77          53
 counts_recv        2288        2288        2200
           1        2288
           2        2200
*** Error in forrtl: error (76): Abort trap signal
Image              PC                Routine            Line        Source
gnaqpms.v1.6.0_jx  00000000007A5F3A  Unknown               Unknown  Unknown
libpthread-2.17.s  00002BA00DADA5D0  Unknown               Unknown  Unknown
libc-2.17.so       00002BA00E01F207  gsignal               Unknown  Unknown
libc-2.17.so       00002BA00E0208F8  abort                 Unknown  Unknown
libc-2.17.so       00002BA00E061D27  Unknown               Unknown  Unknown
libc-2.17.so       00002BA00E06A489  Unknown               Unknown  Unknown
libmpi.so.12.0     00002BA00CAC2AED  Unknown               Unknown  Unknown
libmpi.so.12.0     00002BA00CAC4A54  Unknown               Unknown  Unknown
libmpi.so.12       00002BA00CAC3188  MPI_Scatterv          Unknown  Unknown
libmpifort.so.12.  00002BA00D445A7A  mpi_scatterv          Unknown  Unknown
gnaqpms.v1.6.0_jx  0000000000475EA4  naqpms_parallel_m        1269  naqpms_parallel.f90
gnaqpms.v1.6.0_jx  00000000005953BF  rd_met_pyramid_           151  rd_met_pyramid.f90
gnaqpms.v1.6.0_jx  0000000000617709  read_data_                 61  naqpms_readdata.f90
gnaqpms.v1.6.0_jx  0000000000647BA4  naqpms_calc_mp_ca         141  naqpms_calc.f90
gnaqpms.v1.6.0_jx  000000000065A835  MAIN__                     86  main.f90
gnaqpms.v1.6.0_jx  000000000040B45E  Unknown               Unknown  Unknown
libc-2.17.so       00002BA00E00B3D5  __libc_start_main     Unknown  Unknown
gnaqpms.v1.6.0_jx  000000000040B369  Unknown               Unknown  Unknown
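
For what it's worth: the gathered counts_recv = (2288, 2288, 2200) sum to 6776 = 88 * 77 = nx(ne_in) * ny(ne_in), which matches the size of read_buffer, and the loop above would turn them into displacements = (0, 2288, 4576). Ranks 1 and 2 print the receive_size they got back from the test scatter, but rank 0's line never appears before the abort.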
