Fortran 中的 MPI_Scatterv 因中止陷阱信号而崩溃
我用Fortran写了一个主从IO函数。首先,我使用 0 进程读取文件,将数据放入数组 read_buffer
中,然后调用子例程“scatter_data”。
我创建了一些通信器来将数据从进程 0 分散到进程 0-8。
这些通信器是这样的:
2, 5, 8 :sub_io_communicator (In this sub_io_communicator , sub_iorank is /0, 1, 2/ )
1, 4, 7 :sub_io_communicator (In this sub_io_communicator , sub_iorank is /0, 1, 2/ )
0, 3, 6 :sub_io_communicator (In this sub_io_communicator , sub_iorank is /0, 1, 2/ )
2, 1, 0 :master_communicator
DATA
||
0 process read
||
0 call MPI_scatterv in communicator "master_communicator"
/ | \
0 1 2 call MPI_scatterv in communicator "sub_io_communicator"
/ | \ / | \ / | \
0 3 6 1 4 7 2 5 8
但是当我调用 MPI_Scatterv 时,它崩溃了。我使用“print”对其进行调试,发现错误出在“call MPI_Scatterv”中。所以,我在这个子例程中写了一个非常简单的 MPI_Scatterv 来看看它是否会起作用,但它不起作用。
我的代码是这样的:
SUBROUTINE scatter_data(read_buffer,ne_in)
use naqpms_nest, only : nest, nxlo, nylo, ratio, nx, ny
implicit none
include 'mpif.h'
integer, INTENT(IN) :: ne_in
integer :: ierr
integer :: location
integer :: ii, jj, kk, zz, dd, send_size,receive_size
real, INTENT(IN), dimension(nx(ne_in)*ny(ne_in)) :: read_buffer
integer, dimension(nx(ne_in)*ny(ne_in)) :: read_buffer_int
real, allocatable :: rerange_buffer(:)
integer, allocatable :: receive_buffer_int(:)
integer , allocatable :: counts_recv(:),displacements(:)
integer :: distance,left_bdy,left_bdy2,rigth_bdy,rigth_bdy2
integer :: tmp1(3), tmp2(3)
IF( sub_iorank.EQ.0 ) THEN
if(allocated(counts_recv))deallocate(counts_recv)
if(allocated(displacements))deallocate(displacements)
if(allocated(receive_buffer))deallocate(receive_buffer)
if(allocated(rerange_buffer))deallocate(rerange_buffer)
IF( master_iorank.EQ.0) THEN
allocate(counts_recv(dims(2,ne_in)))
allocate(displacements(dims(2,ne_in)))
ENDIF
print*,"location",my_rank,nx(ne_in), ey(ne_in), sy(ne_in)
receive_size = nx(ne_in) * (ey(ne_in)-sy(ne_in)+1)
allocate(receive_buffer(receive_size))
allocate(receive_buffer_int(receive_size))
allocate(rerange_buffer(receive_size))
CALL MPI_Gather(receive_size, 1, MPI_INTEGER, counts_recv, 1, MPI_INTEGER,&
0, master_communicator ,ierr )
tmp1(1:3)= (/1,1,1/)
tmp2(1:3)= (/0,1,2/)
IF(my_rank==0)print*,"counts_recv",counts_recv
CALL mpi_scatterv (counts_recv, tmp1, tmp2, MPI_INT,& ! this is just for test
receive_size, 1, mpi_int, 0, master_communicator, ierr )
print*,my_rank,receive_size
IF(my_rank.EQ.0) THEN
displacements(1)=0
do ii=2, dims(2,ne_in)
displacements(ii) = displacements(ii-1) + counts_recv(ii-1)
enddo
ENDIF
IF(my_rank==0)print*,displacements,counts_recv
CALL mpi_scatterv (read_buffer, counts_recv, displacements, mpi_real,&
receive_buffer, receive_size, mpi_real, 0, master_communicator, ierr )
IF(my_rank==0)print*,"mpi_scatterv one ok",my_rank
ENDIF!sub_iorank =0
IF(sub_iorank .EQ. 0) THEN
if(allocated(counts_recv))deallocate(counts_recv)
if(allocated(displacements))deallocate(displacements)
allocate(counts_recv(dims(1,ne_in)))
allocate(displacements(dims(1,ne_in)))
ENDIF
receive_size =(ex(ne_in)-sx(ne_in)+1)*(ey(ne_in)-sy(ne_in)+1)
CALL MPI_Gather(receive_size, 1, MPI_INTEGER, counts_recv, 1, MPI_INTEGER, 0, sub_io_communicator ,ierr )
IF(sub_iorank .EQ. 0) THEN
displacements(1)=0
do ii=2,dims(1,ne_in)
displacements(ii) = displacements(ii-1) + counts_recv(ii-1)
enddo
ENDIF
IF(sub_iorank .EQ. 0) THEN
DO dd = 1, dims(1,ne_in)
DO jj = 1, bdy_gather(4,dd,ne_in)-bdy_gather(3,dd,ne_in)+1
distance = bdy_gather(2,dd,ne_in)-bdy_gather(1,dd,ne_in)+1
left_bdy = (jj-1)*nx(ne_in) + bdy_gather(1,dd,ne_in)
rigth_bdy = (jj-1)*nx(ne_in) + bdy_gather(2,dd,ne_in)
left_bdy2 = displacements(dd) + (jj-1)*distance + 1
rigth_bdy2 = displacements(dd) + (jj-1)*distance + distance
rerange_buffer( left_bdy2 : rigth_bdy2) = receive_buffer(left_bdy : rigth_bdy )
ENDDO
ENDDO
ENDIF
if(allocated(receive_buffer))deallocate(receive_buffer)
allocate(receive_buffer(receive_size))
IF(sub_iorank .EQ. 0) print*, my_rank, counts_recv, displacements
CALL mpi_scatterv( rerange_buffer, counts_recv, displacements, mpi_real,&
receive_buffer, receive_size, mpi_real, 0, sub_io_communicator,ierr)
IF(sub_iorank .EQ. 0) print*,"mpi_scatterv ok"
END SUBROUTINE scatter_data
我运行此代码: mpirun -np 9 ./gnaqpms.v1.6.0_jx0307.exe
然后,日志文件中的错误如下:
location 0 88 26 1
location 1 88 52 27
location 2 88 77 53
counts_recv 2288 2288 2200
1 2288
2 2200
*** Error in forrtl: error (76): Abort trap signal
Image PC Routine Line Source
gnaqpms.v1.6.0_jx 00000000007A5F3A Unknown Unknown Unknown
libpthread-2.17.s 00002BA00DADA5D0 Unknown Unknown Unknown
libc-2.17.so 00002BA00E01F207 gsignal Unknown Unknown
libc-2.17.so 00002BA00E0208F8 abort Unknown Unknown
libc-2.17.so 00002BA00E061D27 Unknown Unknown Unknown
libc-2.17.so 00002BA00E06A489 Unknown Unknown Unknown
libmpi.so.12.0 00002BA00CAC2AED Unknown Unknown Unknown
libmpi.so.12.0 00002BA00CAC4A54 Unknown Unknown Unknown
libmpi.so.12 00002BA00CAC3188 MPI_Scatterv Unknown Unknown
libmpifort.so.12. 00002BA00D445A7A mpi_scatterv Unknown Unknown
gnaqpms.v1.6.0_jx 0000000000475EA4 naqpms_parallel_m 1269 naqpms_parallel.f90
gnaqpms.v1.6.0_jx 00000000005953BF rd_met_pyramid_ 151 rd_met_pyramid.f90
gnaqpms.v1.6.0_jx 0000000000617709 read_data_ 61 naqpms_readdata.f90
gnaqpms.v1.6.0_jx 0000000000647BA4 naqpms_calc_mp_ca 141 naqpms_calc.f90
gnaqpms.v1.6.0_jx 000000000065A835 MAIN__ 86 main.f90
gnaqpms.v1.6.0_jx 000000000040B45E Unknown Unknown Unknown
libc-2.17.so 00002BA00E00B3D5 __libc_start_main Unknown Unknown
gnaqpms.v1.6.0_jx 000000000040B369 Unknown Unknown Unknown
I have write a master-slave IO function with Fortran. First, I read the file with 0 process, put the data in the array read_buffer
, and then I call subroutine"scatter_data".
I have created some communicators to scatter data from process 0 to process 0-8.
These communicators are like this:
2, 5, 8 :sub_io_communicator (In this sub_io_communicator , sub_iorank is /0, 1, 2/ )
1, 4, 7 :sub_io_communicator (In this sub_io_communicator , sub_iorank is /0, 1, 2/ )
0, 3, 6 :sub_io_communicator (In this sub_io_communicator , sub_iorank is /0, 1, 2/ )
2, 1, 0 :master_communicator
DATA
||
0 process read
||
0 call MPI_scatterv in communicator "master_communicator"
/ | \
0 1 2 call MPI_scatterv in communicator "sub_io_communicator"
/ | \ / | \ / | \
0 3 6 1 4 7 2 5 8
but when I call MPI_Scatterv, it crashes. I use "print" to debug it , and find the bug is in "call MPI_Scatterv". SO, I write a very simple MPI_Scatterv in this subroutine to see whether it will work, but it does not.
My code is like this :
SUBROUTINE scatter_data(read_buffer,ne_in)
use naqpms_nest, only : nest, nxlo, nylo, ratio, nx, ny
implicit none
include 'mpif.h'
integer, INTENT(IN) :: ne_in
integer :: ierr
integer :: location
integer :: ii, jj, kk, zz, dd, send_size,receive_size
real, INTENT(IN), dimension(nx(ne_in)*ny(ne_in)) :: read_buffer
integer, dimension(nx(ne_in)*ny(ne_in)) :: read_buffer_int
real, allocatable :: rerange_buffer(:)
integer, allocatable :: receive_buffer_int(:)
integer , allocatable :: counts_recv(:),displacements(:)
integer :: distance,left_bdy,left_bdy2,rigth_bdy,rigth_bdy2
integer :: tmp1(3), tmp2(3)
IF( sub_iorank.EQ.0 ) THEN
if(allocated(counts_recv))deallocate(counts_recv)
if(allocated(displacements))deallocate(displacements)
if(allocated(receive_buffer))deallocate(receive_buffer)
if(allocated(rerange_buffer))deallocate(rerange_buffer)
IF( master_iorank.EQ.0) THEN
allocate(counts_recv(dims(2,ne_in)))
allocate(displacements(dims(2,ne_in)))
ENDIF
print*,"location",my_rank,nx(ne_in), ey(ne_in), sy(ne_in)
receive_size = nx(ne_in) * (ey(ne_in)-sy(ne_in)+1)
allocate(receive_buffer(receive_size))
allocate(receive_buffer_int(receive_size))
allocate(rerange_buffer(receive_size))
CALL MPI_Gather(receive_size, 1, MPI_INTEGER, counts_recv, 1, MPI_INTEGER,&
0, master_communicator ,ierr )
tmp1(1:3)= (/1,1,1/)
tmp2(1:3)= (/0,1,2/)
IF(my_rank==0)print*,"counts_recv",counts_recv
CALL mpi_scatterv (counts_recv, tmp1, tmp2, MPI_INT,& ! this is just for test
receive_size, 1, mpi_int, 0, master_communicator, ierr )
print*,my_rank,receive_size
IF(my_rank.EQ.0) THEN
displacements(1)=0
do ii=2, dims(2,ne_in)
displacements(ii) = displacements(ii-1) + counts_recv(ii-1)
enddo
ENDIF
IF(my_rank==0)print*,displacements,counts_recv
CALL mpi_scatterv (read_buffer, counts_recv, displacements, mpi_real,&
receive_buffer, receive_size, mpi_real, 0, master_communicator, ierr )
IF(my_rank==0)print*,"mpi_scatterv one ok",my_rank
ENDIF!sub_iorank =0
IF(sub_iorank .EQ. 0) THEN
if(allocated(counts_recv))deallocate(counts_recv)
if(allocated(displacements))deallocate(displacements)
allocate(counts_recv(dims(1,ne_in)))
allocate(displacements(dims(1,ne_in)))
ENDIF
receive_size =(ex(ne_in)-sx(ne_in)+1)*(ey(ne_in)-sy(ne_in)+1)
CALL MPI_Gather(receive_size, 1, MPI_INTEGER, counts_recv, 1, MPI_INTEGER, 0, sub_io_communicator ,ierr )
IF(sub_iorank .EQ. 0) THEN
displacements(1)=0
do ii=2,dims(1,ne_in)
displacements(ii) = displacements(ii-1) + counts_recv(ii-1)
enddo
ENDIF
IF(sub_iorank .EQ. 0) THEN
DO dd = 1, dims(1,ne_in)
DO jj = 1, bdy_gather(4,dd,ne_in)-bdy_gather(3,dd,ne_in)+1
distance = bdy_gather(2,dd,ne_in)-bdy_gather(1,dd,ne_in)+1
left_bdy = (jj-1)*nx(ne_in) + bdy_gather(1,dd,ne_in)
rigth_bdy = (jj-1)*nx(ne_in) + bdy_gather(2,dd,ne_in)
left_bdy2 = displacements(dd) + (jj-1)*distance + 1
rigth_bdy2 = displacements(dd) + (jj-1)*distance + distance
rerange_buffer( left_bdy2 : rigth_bdy2) = receive_buffer(left_bdy : rigth_bdy )
ENDDO
ENDDO
ENDIF
if(allocated(receive_buffer))deallocate(receive_buffer)
allocate(receive_buffer(receive_size))
IF(sub_iorank .EQ. 0) print*, my_rank, counts_recv, displacements
CALL mpi_scatterv( rerange_buffer, counts_recv, displacements, mpi_real,&
receive_buffer, receive_size, mpi_real, 0, sub_io_communicator,ierr)
IF(sub_iorank .EQ. 0) print*,"mpi_scatterv ok"
END SUBROUTINE scatter_data
I run this code : mpirun -np 9 ./gnaqpms.v1.6.0_jx0307.exe
then, the error in the log file is like this :
location 0 88 26 1
location 1 88 52 27
location 2 88 77 53
counts_recv 2288 2288 2200
1 2288
2 2200
*** Error in forrtl: error (76): Abort trap signal
Image PC Routine Line Source
gnaqpms.v1.6.0_jx 00000000007A5F3A Unknown Unknown Unknown
libpthread-2.17.s 00002BA00DADA5D0 Unknown Unknown Unknown
libc-2.17.so 00002BA00E01F207 gsignal Unknown Unknown
libc-2.17.so 00002BA00E0208F8 abort Unknown Unknown
libc-2.17.so 00002BA00E061D27 Unknown Unknown Unknown
libc-2.17.so 00002BA00E06A489 Unknown Unknown Unknown
libmpi.so.12.0 00002BA00CAC2AED Unknown Unknown Unknown
libmpi.so.12.0 00002BA00CAC4A54 Unknown Unknown Unknown
libmpi.so.12 00002BA00CAC3188 MPI_Scatterv Unknown Unknown
libmpifort.so.12. 00002BA00D445A7A mpi_scatterv Unknown Unknown
gnaqpms.v1.6.0_jx 0000000000475EA4 naqpms_parallel_m 1269 naqpms_parallel.f90
gnaqpms.v1.6.0_jx 00000000005953BF rd_met_pyramid_ 151 rd_met_pyramid.f90
gnaqpms.v1.6.0_jx 0000000000617709 read_data_ 61 naqpms_readdata.f90
gnaqpms.v1.6.0_jx 0000000000647BA4 naqpms_calc_mp_ca 141 naqpms_calc.f90
gnaqpms.v1.6.0_jx 000000000065A835 MAIN__ 86 main.f90
gnaqpms.v1.6.0_jx 000000000040B45E Unknown Unknown Unknown
libc-2.17.so 00002BA00E00B3D5 __libc_start_main Unknown Unknown
gnaqpms.v1.6.0_jx 000000000040B369 Unknown Unknown Unknown
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论