Matrix multiplication in Fortran using MPI and MKL

Posted on 2025-01-10 00:56:24

The problem I am having is mainly related to MPI. I have developed code to multiply two matrices using MPI and MKL in Fortran. It gives the correct output, but the issue is that if I increase the number of processors used for the calculation, the time taken also increases: it takes 36 seconds to multiply the matrices using 4 processors, but 58 seconds using 5 processors. I have also found that the processor with rank 1 takes more time than the others, even though the workload is almost the same for all processors. That is, if NP=4 then rank 1 takes about the same amount of time as the others (screenshot), but for NP=5, rank 1 takes more time than the others (screenshot).

use mkl_service
include 'mpif.h'

! variables

!------mpi initialization---!

!----someoperation---!

    if(rank.eq.root)then

!       some task then
        
        do i=1,nworkers-1
            call mpi_send(nobsvld, 1, mpi_int, i, 3, mpi_comm_world, ierr)
        end do

! multiplication routine called

        call matrixMultiply(dble(p), dble(htra), phtra, nsea, nsea, nobsvld, nworkers, mpi_comm_world, rank, mpi_int, mpi_real8)
        
        call matrixMultiply(dble(hvld), phtra, hphtra, nobsvld, nsea, nobsvld, nworkers, mpi_comm_world, rank, mpi_int, mpi_real8)

!       some task then

        call matrixMultiply(gmm, inovvld, gain, nobsvld, nobsvld, 1, nworkers, mpi_comm_world, rank, mpi_int, mpi_real8)
    
        call matrixMultiply(phtra, gain, gainres, nsea, nobsvld, 1, nworkers, mpi_comm_world, rank, mpi_int, mpi_real8)

        !       some task then
    else
!---Worker---!

        call mpi_recv(nobsvld, 1, mpi_int, 0, 3, mpi_comm_world, stats, ierr)
        call recv_cal_send(nsea, nsea, nobsvld, mpi_comm_world, mpi_int, mpi_real8,rank)    
            
        call recv_cal_send(nobsvld, nsea, nobsvld, mpi_comm_world, mpi_int, mpi_real8,rank) 
        
        call recv_cal_send(nobsvld, nobsvld, 1, mpi_comm_world, mpi_int, mpi_real8,rank)    
    
        call recv_cal_send(nsea, nobsvld, 1, mpi_comm_world, mpi_int, mpi_real8,rank)   

    end if

    call mpi_finalize(ierr)
end program

!-------------------------------------routines-----------------------------------!

!-------------------routine for root to send as well as calculate some of the portion and receive from workers--------------------!
subroutine matrixMultiply(A, B, C, ax, ay, by, n, comm, rank, dt_int, dt_real)
    integer :: ax, ay, by, rows, averows, extra, e_row, offset, source, ierr
    integer, dimension(5) :: stats  !mpi_status_size==5
    real*8 :: A(ax,ay), B(ay,by), C(ax,by)
    real*8, allocatable :: A_buff(:,:), C_buff(:,:)
    
!-------------------------------------send portions to workers-------------------------------------!    
    offset=1
    averows=ax/n
    extra=modulo(ax,n)
    
    if(extra.gt.0)then
        offset=offset+averows+1
    else
        offset=offset+averows
    end if
    
    do i=1,n-1
        if(extra.gt.i)then
            rows=averows+1
        else 
            rows=averows
        end if
        
        e_row = offset+rows-1

        call mpi_send(rows, 1, dt_int, i, i, comm, ierr)
        call mpi_send(offset, 1, dt_int, i, i, comm, ierr)
        call mpi_send(B, ay*by, dt_real, i, i, comm, ierr)
!call cpu_time(s1)
        allocate(A_buff(rows,ay))
        A_buff(1:rows,1:ay)=A(offset:e_row,1:ay)
        call mpi_send(A_buff, rows*ay, dt_real, i, i, comm, ierr)
!call cpu_time(s2)
!print*,s2-s1," to send to",i
        offset=offset+rows
        deallocate(A_buff)
    end do
    
!-------------------------------------calculate the portion of itself-------------------------------------!
    if(extra.gt.0)then
        rows=averows+1
    else 
        rows=averows
    end if

    allocate(A_buff(rows, ay))
    
    A_buff(1:rows,1:ay)=A(1:rows,1:ay)
    
    allocate(C_buff(rows,by))
call cpu_time(c1)
    call dgemm('N','N', rows, by, ay, 1.d0, A_buff, rows, B, ay, 0.d0, C_buff, rows)
call cpu_time(c2)
print*,c2-c1," for calculation by root"
    deallocate(A_buff)
    
    C(1:rows,1:by)=C_buff(1:rows,1:by)
    
    deallocate(C_buff)
    
!-------------------------------------receive calculated portions from workers -------------------------------------!

    do i=1,n-1
        source=i
        call mpi_recv(rows, 1, dt_int, source, source, comm, stats, ierr)
        call mpi_recv(offset, 1, dt_int, source, source, comm, stats, ierr)
        
        e_row = offset+rows-1
        allocate(C_buff(rows,by))
        call mpi_recv(C_buff, rows*by, dt_real, source, source, comm, stats, ierr)
        
        C(offset:e_row,1:by)=C_buff(1:rows,1:by)

        deallocate(C_buff)
    end do

end subroutine

!---------routine to receive portion from root to calculate and send back----------!
subroutine recv_cal_send(ax, ay, by, comm, dt_int, dt_real,rank)
    
    integer :: comm, rank, dt_int, dt_real, ierr, offset
    integer :: source, ax, ay, bx, by, cy, rows
    integer, dimension(5) :: stats  !mpi_status_size==5
    real*8, dimension(ay,by) :: B
    real*8, allocatable :: A_buffer(:,:)
    real*8, allocatable :: buff(:,:)
    
    bx=ay
    cy=by
    source = 0

    call mpi_recv(rows, 1, dt_int, source, rank, comm, stats, ierr)
    call mpi_recv(offset, 1, dt_int, source, rank, comm, stats, ierr)
    call mpi_recv(B, bx*by, dt_real, source, rank, comm, stats, ierr)
    allocate(A_buffer(rows,ay))
    call mpi_recv(A_buffer, rows*ay, dt_real, source, rank, comm, stats, ierr)

    allocate(buff(rows,cy))
call cpu_time(c1)
    call dgemm('N','N', rows, by, bx , 1.d0, A_buffer, rows, B, bx, 0.d0, buff, rows)
call cpu_time(c2)
print*,c2-c1," for calculation by",rank
    deallocate(A_buffer)
!call cpu_time(s1)

    call mpi_send(rows, 1, dt_int, 0, rank, comm, ierr)
    call mpi_send(offset, 1, dt_int, 0, rank, comm, ierr)
    call mpi_send(buff, rows*cy,dt_real, 0, rank, comm, ierr)
!call cpu_time(s2)
!print*,s2-s1," cal sent by",rank
    deallocate(buff)
end subroutine


Comments (1)

小…红帽 2025-01-17 00:56:24

(Note: lacking a complete example program, I have to make a few assumptions about what is going on here - should a full example be provided I will modify as appropriate. Note also that there may be an issue with the number of real cores you have in your system, but as I understand it your solution has real problems that are independent of this.)

I am afraid that the way you have written this is inherently non-scalable, and it is non-scalable in two different ways: it is not scalable in time and, worse to my mind, it is not scalable in memory. The former is the cause of the poor performance you are observing; the latter means that using more processes will not allow you to solve bigger problems.

To try and understand what I mean by the former, let's make a little model of the time taken by the solution above when running on P processes for NxN matrices. The time taken for the compute that performs the matrix multiplication will be proportional to N**3/P, which is a nicely decreasing function of P - good. But you also have to pass messages, and in particular you seem to send all of B from rank 0 to every other process. As the code is written, the time for this will be proportional to P*N**2 (ignoring latency), which is an increasing function of P - the more processes you use, the slower the communication becomes. Thus the total time is the sum of the (decreasing) compute time and the (increasing) communication time - and for sufficiently large P this tends to infinity, not zero; this is what I mean by inherently non-scalable in time. Now, the problem in your case is that the compute is quite quick, so at least for the N you are using it only takes a fairly modest P before the communication time dominates, and the time to solution then increases approximately linearly with the number of processes.
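To make the crossover concrete (an illustrative model only, with made-up constants rather than measured values): write the total time as T(P) = a*N**3/P + b*P*N**2 for machine-dependent constants a and b. Then dT/dP = -a*N**3/P**2 + b*N**2, which vanishes at P* = sqrt(a*N/b). Below P* adding processes helps; above it the b*P*N**2 communication term dominates and every extra process makes the run slower - consistent with the jump you report going from 4 to 5 processes.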

You can marginally improve this by using mpi_bcast (and mpi_gatherv for the final result). This changes the communication time to Log(P)*N**2, but it is still increasing with P, so the algorithm is still inherently non-scalable. More generally, my experience is that these master-slave algorithms are rarely worth investigating at all if you want performance - only if the compute is ferociously expensive do they have any worth.
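As a rough illustration of that restructuring, here is a minimal sketch only, not a drop-in replacement for your matrixMultiply: the argument names (A_block, rowcnt, displs, ...) are made up for this example, and it assumes the row blocks of A have already been distributed (e.g. with mpi_scatterv) and that root knows the per-rank row counts and the element displacements into the gather buffer.

subroutine gemm_bcast_gather(A_block, B, C, myrows, ax, ay, by, rowcnt, displs, comm, rank, nproc)
!   illustrative sketch - argument names are hypothetical, A row blocks assumed already distributed
    include 'mpif.h'
    integer :: myrows, ax, ay, by, comm, rank, nproc, ierr, i, r0
    integer :: rowcnt(nproc), displs(nproc), counts(nproc)
    real*8  :: A_block(myrows,ay), B(ay,by), C(ax,by)
    real*8, allocatable :: C_block(:,:), ctmp(:)

!   one collective replaces the n-1 point-to-point sends of B
    call mpi_bcast(B, ay*by, mpi_real8, 0, comm, ierr)

!   every rank (root included) multiplies only its own block of rows
    allocate(C_block(myrows,by))
    call dgemm('N','N', myrows, by, ay, 1.d0, A_block, myrows, B, ay, 0.d0, C_block, myrows)

!   gather all the row blocks back to root in a single call
    counts = rowcnt*by
    allocate(ctmp(ax*by))
    call mpi_gatherv(C_block, myrows*by, mpi_real8, ctmp, counts, displs, mpi_real8, 0, comm, ierr)

!   the row blocks of a column-major matrix are not contiguous, so root
!   unpacks each rank's contiguous segment back into the correct rows of C
    if(rank.eq.0)then
        r0 = 1
        do i = 1, nproc
            C(r0:r0+rowcnt(i)-1, 1:by) = reshape(ctmp(displs(i)+1:displs(i)+counts(i)), [rowcnt(i), by])
            r0 = r0 + rowcnt(i)
        end do
    end if
    deallocate(C_block, ctmp)
end subroutine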

The above could be helped if you increase N - this makes the compute time more expensive compared to the communication time, so you are more likely to see the decrease with P due to the former than the increase due to the latter, at least up to a larger value of P; at some point the communication time will always take over. But because you hold all of the matrices on all processes, you cannot solve any bigger a problem on P processes than you can on 1. This is what I mean by inherently non-scalable in memory.
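To put a number on it (purely illustrative): with N = 20000, one N x N real*8 matrix occupies 8*N**2 = 3.2 GB, so holding A, B and C on every process costs about 9.6 GB per process - and that figure does not shrink at all as you add processes, it is simply replicated.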

A good solution will address both problems, and if you really want a good solution the two issues are coupled and must both be addressed - you must use a properly distributed-memory solution if you want good parallel scaling in time. For matrix multiplication an example algorithm is Cannon's algorithm which, in the spirit of the little model above, has compute time proportional to N**3/P and communication time proportional to N**2/Sqrt(P) - note that these are both decreasing functions of P, so the time to solution here tends to zero; this is an algorithm that is scalable in time. Further, the memory use per process is proportional to N**2/P, again a decreasing function of P, so this algorithm also scales in memory. But you shouldn't write this yourself - a number of parallel libraries implement these scalable matrix multiplication algorithms. I use ScaLAPACK (see also parallel matrix multiplication using PBLAS), and MKL ships a version of this library; there are also free versions.
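For orientation only, here is a skeleton of what the ScaLAPACK/PBLAS route looks like with MKL - a sketch assuming square N x N matrices and a simple process grid, with the global size, block size nb and fill values chosen arbitrarily, not taken from your code:

program pdgemm_sketch
!   Illustrative skeleton only: square N x N matrices, block-cyclic layout,
!   link against MKL ScaLAPACK + BLACS.  Every process owns just an
!   mloc x nloc tile of each matrix, so memory per process scales as N**2/P.
    implicit none
    integer, parameter :: n = 4000, nb = 64        ! global size, block size (arbitrary)
    integer :: iam, nprocs, ictxt, nprow, npcol, myrow, mycol
    integer :: mloc, nloc, info, desca(9), descb(9), descc(9)
    integer, external :: numroc
    real*8, allocatable :: a(:,:), b(:,:), c(:,:)

    call blacs_pinfo(iam, nprocs)
    nprow = int(sqrt(dble(nprocs)))                ! simple (not optimal) grid choice
    npcol = nprocs/nprow
    call blacs_get(-1, 0, ictxt)
    call blacs_gridinit(ictxt, 'Row-major', nprow, npcol)
    call blacs_gridinfo(ictxt, nprow, npcol, myrow, mycol)

    if(myrow.ge.0 .and. myrow.lt.nprow)then        ! only processes inside the grid take part
        mloc = numroc(n, nb, myrow, 0, nprow)      ! local tile sizes
        nloc = numroc(n, nb, mycol, 0, npcol)
        call descinit(desca, n, n, nb, nb, 0, 0, ictxt, max(1,mloc), info)
        call descinit(descb, n, n, nb, nb, 0, 0, ictxt, max(1,mloc), info)
        call descinit(descc, n, n, nb, nb, 0, 0, ictxt, max(1,mloc), info)

        allocate(a(mloc,nloc), b(mloc,nloc), c(mloc,nloc))
        a = 1.d0                                   ! fill the local tiles somehow
        b = 2.d0
        c = 0.d0

!       distributed C = A*B ; all communication happens inside the library
        call pdgemm('N', 'N', n, n, n, 1.d0, a, 1, 1, desca, &
                    b, 1, 1, descb, 0.d0, c, 1, 1, descc)

        call blacs_gridexit(ictxt)
    end if
    call blacs_exit(0)
end program pdgemm_sketch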

Edit:
A couple of other thoughts

  1. Have you made sure you are using single-threaded BLAS calls?
  2. If you are only interested in single-node parallelism, use a parallel (threaded) BLAS call instead - you will get great performance for a minimal investment of effort (see the sketch after this list)
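A small sketch of both points, using the mkl_service module your code already pulls in; the matrix size and fill values are arbitrary placeholders:

program mkl_threads_sketch
!   Illustrative only - shows the two options above; size and fill values are arbitrary.
    use mkl_service
    implicit none
    integer, parameter :: n = 2000
    real*8, allocatable :: a(:,:), b(:,:), c(:,:)

    allocate(a(n,n), b(n,n), c(n,n))
    a = 1.d0
    b = 2.d0

!   Option 1: inside an MPI code where every rank calls dgemm itself, pin MKL
!   to one thread per rank so the ranks do not oversubscribe the cores
!   (equivalent to exporting MKL_NUM_THREADS=1).
    call mkl_set_num_threads(1)

!   Option 2: single node, no MPI - let threaded MKL use all the cores and
!   make one dgemm call on the full matrices.
    call mkl_set_num_threads(mkl_get_max_threads())
    call dgemm('N', 'N', n, n, n, 1.d0, a, n, b, n, 0.d0, c, n)
    print *, 'c(1,1) =', c(1,1)
end program mkl_threads_sketch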
