与 c++ 的结果存在相当大的差异和 fortran 进行相同的计算

发布于 2024-11-15 07:57:29 字数 6651 浏览 4 评论 0原文

我正在将 FORTRAN 程序翻译为 CUDA。
当我翻译一个子程序时,我发现结果差异很大,从分数的第三位开始!我读到它们确实有所不同(我读到差异相对来说非常小)。不知道程序有没有问题。所以我也用 C++ 编译了代码,但得到了与 CUDA 相同的结果。我发布了这两个代码(C++ 和 FORTRAN)。 请检查。

FORTRAN 代码(原始)

      PROGRAM MAIN
      IMPLICIT REAL(A-H,O-Z)
      parameter (nda=3,nda3=nda*3,ND05=3)
c      INCLUDE 'SIZES'
C
C         CALCULATE LENNARD-JONES POTENTIAL ENERGY DERIVATIVES
C
      COMMON/QPDOT/Q(NDA3),PDOT(NDA3)
      COMMON/COORS/R(NDA*(NDA+1)/2)
      COMMON/LENJB/ALJ(ND05),BLJ(ND05),CLJ(ND05),N5J(ND05),N5K(ND05),
     *NREP(ND05),MREP(ND05),LREP(ND05)
      DIMENSION JKA(ND05),RNA(ND05),RMB(ND05),RLC(ND05)
      common/ind/natoms,i3n

  831 FORMAT('   NJ=',I4,'  NK=',I4,'  ALJ=',1PE12.5,'  BLJ=',E12.5,
     *'  CLJ=',E12.5,'  NREP=',I4,'  MREP=',I4,'  LREP=',I4)
  815 FORMAT(/)

      read*,natoms
      read*,nlj
      i3n=natoms*3
c terms for calcluations, N5J & N5K are indices (N5K > N5J)
c ALJ, BLJ & CLJ are powers for generalized LENNARD-JONES potential.
         DO 10 I=1,NLJ
            READ(5,*)N5J(I),N5K(I),NREP(I),MREP(I),LREP(I),
     *               ALJ(I),BLJ(I),CLJ(I)
            WRITE(6,831)N5J(I),N5K(I),ALJ(I),BLJ(I),CLJ(I),
     *                  NREP(I),MREP(I),LREP(I)
         WRITE(6,815)
10    continue
C
          read(5,*)(q(i),i=1,i3n)

      do 30 i=1,i3n
        print*,'q (',i,') = ',q(i)
30    continue

      DO 40 MN=1,i3n
         PDOT(MN)=0.0
40    continue
c setting zero values for pdot, which is force
      IF (NLJ.NE.0) THEN
         CALL LENJ(1,NLJ)
      ENDIF


      end


      SUBROUTINE LENJ(INL,LNL)
      IMPLICIT REAL (A-H,O-Z)
      parameter (nda=3,nda3=nda*3,ND05=3)
C         CALCULATE LENNARD-JONES POTENTIAL ENERGY DERIVATIVES
C

      COMMON/QPDOT/Q(NDA3),PDOT(NDA3)
      COMMON/COORS/R(NDA*(NDA+1)/2)
      COMMON/LENJB/ALJ(ND05),BLJ(ND05),CLJ(ND05),N5J(ND05),N5K(ND05),
     *NREP(ND05),MREP(ND05),LREP(ND05)
      DIMENSION JKA(ND05),RNA(ND05),RMB(ND05),RLC(ND05)
      LOGICAL FIRST
      DATA FIRST/.TRUE./
      save FIRST,JKA,RNA,RMB,RLC
      common/ind/natoms,i3n


      IF (FIRST) THEN
         DO NL=INL,LNL
            JKA(NL)=ISHFT((N5J(NL)-1)*(2*NATOMS-N5J(NL)),-1)+N5K(NL)
     *             -N5J(NL)
      c to avoid recounting
            RNA(NL)=-NREP(NL)*ALJ(NL)
            RMB(NL)=-MREP(NL)*BLJ(NL)
            RLC(NL)=-LREP(NL)*CLJ(NL)
         ENDDO
C
         FIRST=.FALSE.
      ENDIF
C
C         CODE FOR GENERAL LENNARD-JONES
C
      DO NL=INL,LNL
         J3=3*N5J(NL)
         J2=J3-1
         J1=J2-1
         K3=3*N5K(NL)
         K2=K3-1
         K1=K2-1
      c since each atom is represented by 3 coordinates in space, indices above are used to point for a step of three.
         JK=JKA(NL)
         T1=Q(K1)-Q(J1)
         T2=Q(K2)-Q(J2)
         T3=Q(K3)-Q(J3)
         R(JK)=SQRT(T1*T1+T2*T2+T3*T3)
       c calculating the distance
         RRJK=1.0/R(JK)
         DUM1=RNA(NL)*RRJK**(2+NREP(NL))
         DUM1=DUM1+RMB(NL)*RRJK**(MREP(NL)+2)
         DUM1=DUM1+RLC(NL)*RRJK**(LREP(NL)+2)
       c adding each terms
         TDUM1=DUM1*T1
         TDUM2=DUM1*T2
         TDUM3=DUM1*T3
         PDOT(K1)=PDOT(K1)+TDUM1
         PDOT(K2)=PDOT(K2)+TDUM2
         PDOT(K3)=PDOT(K3)+TDUM3
         PDOT(J1)=PDOT(J1)-TDUM1
         PDOT(J2)=PDOT(J2)-TDUM2
         PDOT(J3)=PDOT(J3)-TDUM3
       c final forces.
      ENDDO
      print*,'...............'
      do i=1,i3n
        print*,'i = ',i ,', dvdq = ',pdot(i)
      enddo
C   N5J(I), N5K(I), NREP(I), MREP(I), LREP(I), ALJ(I), BLJ(I) and CLJ(I) for I = 1, NLJ.
C   This is information for the generalized Lennard-Jones potentials.
C   N5J and N5K are the atoms with N5K greater than N5J.
C   NREP, MREP, and LREP are the powers in the potential energy expression.
C   If a power equals zero the appropriate term is skipped in the calculations.
      return
      END

C++ 代码

#include<iostream>
#include<fstream>
#include<math.h>
#include<stdio.h>
#include<stdlib.h>
using namespace std;

void lenjones(int NATOMS, int* N5J, int* N5K, int* NREP, int* MREP, int* LREP, float* ALJ, float* BLJ, float* CLJ, int NLJ,float* Q, float* PDOT){

size_t NLJF = NLJ*sizeof(float);
size_t NLJI = NLJ*sizeof(float);

float* RMB = (float*)malloc(NLJF);
float* RLC = (float*)malloc(NLJF);
float* RNA = (float*)malloc(NLJF);
int *JKA=(int*)malloc(NLJI);
for (int NL=0; NL < NLJ; NL++){
    JKA[NL]= (((N5J[NL]-1)*((2*NATOMS)-N5J[NL]))>> 1) +N5K[NL]-N5J[NL];
    RNA[NL]=-NREP[NL]*ALJ[NL];
    RMB[NL]=-MREP[NL]*BLJ[NL];
    RLC[NL]=-LREP[NL]*CLJ[NL];
}

int J3,K3;
float T1,T2,T3;
float TDUM1,TDUM2,TDUM3,DUM;
size_t RRS = (NATOMS*(NATOMS+1)/2)*sizeof(float);
float* RR = (float*)malloc(RRS);
int JK;
for (int NL=0; NL < NLJ; NL++){
    J3=(3*N5J[NL])-1;
    K3=(3*N5K[NL])-1;
    T1=Q[K3-2]-Q[J3-2];
    T2=Q[K3-1]-Q[J3-1];
    T3=Q[K3]-Q[J3];
    JK=JKA[NL]-1;
    RR[JK]=sqrtf((T1*T1)+(T2*T2)+(T3*T3));
    RR[JK]=1/RR[JK];
    DUM=(RNA[NL]*powf(RR[JK],2+NREP[NL]));
    DUM+=(RMB[NL]*powf(RR[JK],MREP[NL]+2));
    DUM+=(RLC[NL]*powf(RR[JK],LREP[NL]+2));
    TDUM1=T1*DUM;
    TDUM2=T2*DUM;
    TDUM3=T3*DUM;
    PDOT[K3-2]=PDOT[K3-2]+TDUM1;
    PDOT[K3-1]=PDOT[K3-1]+TDUM2;
    PDOT[K3]=PDOT[K3]+TDUM3;
    PDOT[J3-2]=PDOT[J3-2]-TDUM1;
    PDOT[J3-1]=PDOT[J3-1]-TDUM2;
    PDOT[J3]=PDOT[J3]-TDUM3;
}

}


//=========================================
int main(){

int NATOMS,NDA3,ND05;
scanf("%d %d", &NATOMS, &ND05);
printf("\n");
NDA3=3*NATOMS;
size_t QPDOT = NDA3*sizeof(float);
float* h_Q = (float*)malloc(QPDOT);
float* h_PDOT = (float*)malloc(QPDOT);

size_t NLJ = ND05*sizeof(float);

float* h_ALJ = (float*)malloc(NLJ);
float* h_BLJ = (float*)malloc(NLJ);
float* h_CLJ = (float*)malloc(NLJ);

size_t NLJ_i = ND05*sizeof(int);
int* h_LREP = (int*)malloc(NLJ_i);
int* h_MREP = (int*)malloc(NLJ_i);
int* h_NREP = (int*)malloc(NLJ_i);
int* h_N5J = (int*)malloc(NLJ_i);
int* h_N5K = (int*)malloc(NLJ_i);
int* h_JKA = (int*)malloc(NLJ_i);

for (int i=0; i< ND05; i++){
    cin >> h_N5J[i] >> h_N5K[i] >> h_NREP[i] >> h_MREP[i] >> h_LREP[i] >> h_ALJ[i] >> h_BLJ[i] >> h_CLJ[i];
}
for (int i=0; i<NDA3; i++){
    scanf("%f", &h_Q[i]);
}
for (int i=0; i<NDA3; i++){
    h_PDOT[i]=0;
}

lenjones(NATOMS, &h_N5J[0], &h_N5K[0], &h_NREP[0], &h_MREP[0], &h_LREP[0], &h_ALJ[0], &h_BLJ[0], &h_CLJ[0], ND05, &h_Q[0], &h_PDOT[0]);

cout << "i  " << "Q[i]  " << "PDOT[i]" << endl;
for (int i=0; i<NDA3; i++){
    printf("%d  %e  %le \n" , i, h_Q[i], h_PDOT[i]);
}

}

I am translating a FORTRAN program to CUDA.
When I translated one subroutine I found that results vary considerably, starting from the 3rd digit in the fraction! I've read that they do differ (I've read that the difference would be very small relatively). I don't know if there's anything wrong in the program. So I compiled the code in C++ too, but to get the same result as in CUDA. I am posting both the codes (C++ and FORTRAN).
pls check.

FORTRAN CODE (ORIGINAL)

      PROGRAM MAIN
      IMPLICIT REAL(A-H,O-Z)
      parameter (nda=3,nda3=nda*3,ND05=3)
c      INCLUDE 'SIZES'
C
C         CALCULATE LENNARD-JONES POTENTIAL ENERGY DERIVATIVES
C
      COMMON/QPDOT/Q(NDA3),PDOT(NDA3)
      COMMON/COORS/R(NDA*(NDA+1)/2)
      COMMON/LENJB/ALJ(ND05),BLJ(ND05),CLJ(ND05),N5J(ND05),N5K(ND05),
     *NREP(ND05),MREP(ND05),LREP(ND05)
      DIMENSION JKA(ND05),RNA(ND05),RMB(ND05),RLC(ND05)
      common/ind/natoms,i3n

  831 FORMAT('   NJ=',I4,'  NK=',I4,'  ALJ=',1PE12.5,'  BLJ=',E12.5,
     *'  CLJ=',E12.5,'  NREP=',I4,'  MREP=',I4,'  LREP=',I4)
  815 FORMAT(/)

      read*,natoms
      read*,nlj
      i3n=natoms*3
c terms for calcluations, N5J & N5K are indices (N5K > N5J)
c ALJ, BLJ & CLJ are powers for generalized LENNARD-JONES potential.
         DO 10 I=1,NLJ
            READ(5,*)N5J(I),N5K(I),NREP(I),MREP(I),LREP(I),
     *               ALJ(I),BLJ(I),CLJ(I)
            WRITE(6,831)N5J(I),N5K(I),ALJ(I),BLJ(I),CLJ(I),
     *                  NREP(I),MREP(I),LREP(I)
         WRITE(6,815)
10    continue
C
          read(5,*)(q(i),i=1,i3n)

      do 30 i=1,i3n
        print*,'q (',i,') = ',q(i)
30    continue

      DO 40 MN=1,i3n
         PDOT(MN)=0.0
40    continue
c setting zero values for pdot, which is force
      IF (NLJ.NE.0) THEN
         CALL LENJ(1,NLJ)
      ENDIF


      end


      SUBROUTINE LENJ(INL,LNL)
      IMPLICIT REAL (A-H,O-Z)
      parameter (nda=3,nda3=nda*3,ND05=3)
C         CALCULATE LENNARD-JONES POTENTIAL ENERGY DERIVATIVES
C

      COMMON/QPDOT/Q(NDA3),PDOT(NDA3)
      COMMON/COORS/R(NDA*(NDA+1)/2)
      COMMON/LENJB/ALJ(ND05),BLJ(ND05),CLJ(ND05),N5J(ND05),N5K(ND05),
     *NREP(ND05),MREP(ND05),LREP(ND05)
      DIMENSION JKA(ND05),RNA(ND05),RMB(ND05),RLC(ND05)
      LOGICAL FIRST
      DATA FIRST/.TRUE./
      save FIRST,JKA,RNA,RMB,RLC
      common/ind/natoms,i3n


      IF (FIRST) THEN
         DO NL=INL,LNL
            JKA(NL)=ISHFT((N5J(NL)-1)*(2*NATOMS-N5J(NL)),-1)+N5K(NL)
     *             -N5J(NL)
      c to avoid recounting
            RNA(NL)=-NREP(NL)*ALJ(NL)
            RMB(NL)=-MREP(NL)*BLJ(NL)
            RLC(NL)=-LREP(NL)*CLJ(NL)
         ENDDO
C
         FIRST=.FALSE.
      ENDIF
C
C         CODE FOR GENERAL LENNARD-JONES
C
      DO NL=INL,LNL
         J3=3*N5J(NL)
         J2=J3-1
         J1=J2-1
         K3=3*N5K(NL)
         K2=K3-1
         K1=K2-1
      c since each atom is represented by 3 coordinates in space, indices above are used to point for a step of three.
         JK=JKA(NL)
         T1=Q(K1)-Q(J1)
         T2=Q(K2)-Q(J2)
         T3=Q(K3)-Q(J3)
         R(JK)=SQRT(T1*T1+T2*T2+T3*T3)
       c calculating the distance
         RRJK=1.0/R(JK)
         DUM1=RNA(NL)*RRJK**(2+NREP(NL))
         DUM1=DUM1+RMB(NL)*RRJK**(MREP(NL)+2)
         DUM1=DUM1+RLC(NL)*RRJK**(LREP(NL)+2)
       c adding each terms
         TDUM1=DUM1*T1
         TDUM2=DUM1*T2
         TDUM3=DUM1*T3
         PDOT(K1)=PDOT(K1)+TDUM1
         PDOT(K2)=PDOT(K2)+TDUM2
         PDOT(K3)=PDOT(K3)+TDUM3
         PDOT(J1)=PDOT(J1)-TDUM1
         PDOT(J2)=PDOT(J2)-TDUM2
         PDOT(J3)=PDOT(J3)-TDUM3
       c final forces.
      ENDDO
      print*,'...............'
      do i=1,i3n
        print*,'i = ',i ,', dvdq = ',pdot(i)
      enddo
C   N5J(I), N5K(I), NREP(I), MREP(I), LREP(I), ALJ(I), BLJ(I) and CLJ(I) for I = 1, NLJ.
C   This is information for the generalized Lennard-Jones potentials.
C   N5J and N5K are the atoms with N5K greater than N5J.
C   NREP, MREP, and LREP are the powers in the potential energy expression.
C   If a power equals zero the appropriate term is skipped in the calculations.
      return
      END

C++ CODE

#include<iostream>
#include<fstream>
#include<math.h>
#include<stdio.h>
#include<stdlib.h>
using namespace std;

void lenjones(int NATOMS, int* N5J, int* N5K, int* NREP, int* MREP, int* LREP, float* ALJ, float* BLJ, float* CLJ, int NLJ,float* Q, float* PDOT){

size_t NLJF = NLJ*sizeof(float);
size_t NLJI = NLJ*sizeof(float);

float* RMB = (float*)malloc(NLJF);
float* RLC = (float*)malloc(NLJF);
float* RNA = (float*)malloc(NLJF);
int *JKA=(int*)malloc(NLJI);
for (int NL=0; NL < NLJ; NL++){
    JKA[NL]= (((N5J[NL]-1)*((2*NATOMS)-N5J[NL]))>> 1) +N5K[NL]-N5J[NL];
    RNA[NL]=-NREP[NL]*ALJ[NL];
    RMB[NL]=-MREP[NL]*BLJ[NL];
    RLC[NL]=-LREP[NL]*CLJ[NL];
}

int J3,K3;
float T1,T2,T3;
float TDUM1,TDUM2,TDUM3,DUM;
size_t RRS = (NATOMS*(NATOMS+1)/2)*sizeof(float);
float* RR = (float*)malloc(RRS);
int JK;
for (int NL=0; NL < NLJ; NL++){
    J3=(3*N5J[NL])-1;
    K3=(3*N5K[NL])-1;
    T1=Q[K3-2]-Q[J3-2];
    T2=Q[K3-1]-Q[J3-1];
    T3=Q[K3]-Q[J3];
    JK=JKA[NL]-1;
    RR[JK]=sqrtf((T1*T1)+(T2*T2)+(T3*T3));
    RR[JK]=1/RR[JK];
    DUM=(RNA[NL]*powf(RR[JK],2+NREP[NL]));
    DUM+=(RMB[NL]*powf(RR[JK],MREP[NL]+2));
    DUM+=(RLC[NL]*powf(RR[JK],LREP[NL]+2));
    TDUM1=T1*DUM;
    TDUM2=T2*DUM;
    TDUM3=T3*DUM;
    PDOT[K3-2]=PDOT[K3-2]+TDUM1;
    PDOT[K3-1]=PDOT[K3-1]+TDUM2;
    PDOT[K3]=PDOT[K3]+TDUM3;
    PDOT[J3-2]=PDOT[J3-2]-TDUM1;
    PDOT[J3-1]=PDOT[J3-1]-TDUM2;
    PDOT[J3]=PDOT[J3]-TDUM3;
}

}


//=========================================
int main(){

int NATOMS,NDA3,ND05;
scanf("%d %d", &NATOMS, &ND05);
printf("\n");
NDA3=3*NATOMS;
size_t QPDOT = NDA3*sizeof(float);
float* h_Q = (float*)malloc(QPDOT);
float* h_PDOT = (float*)malloc(QPDOT);

size_t NLJ = ND05*sizeof(float);

float* h_ALJ = (float*)malloc(NLJ);
float* h_BLJ = (float*)malloc(NLJ);
float* h_CLJ = (float*)malloc(NLJ);

size_t NLJ_i = ND05*sizeof(int);
int* h_LREP = (int*)malloc(NLJ_i);
int* h_MREP = (int*)malloc(NLJ_i);
int* h_NREP = (int*)malloc(NLJ_i);
int* h_N5J = (int*)malloc(NLJ_i);
int* h_N5K = (int*)malloc(NLJ_i);
int* h_JKA = (int*)malloc(NLJ_i);

for (int i=0; i< ND05; i++){
    cin >> h_N5J[i] >> h_N5K[i] >> h_NREP[i] >> h_MREP[i] >> h_LREP[i] >> h_ALJ[i] >> h_BLJ[i] >> h_CLJ[i];
}
for (int i=0; i<NDA3; i++){
    scanf("%f", &h_Q[i]);
}
for (int i=0; i<NDA3; i++){
    h_PDOT[i]=0;
}

lenjones(NATOMS, &h_N5J[0], &h_N5K[0], &h_NREP[0], &h_MREP[0], &h_LREP[0], &h_ALJ[0], &h_BLJ[0], &h_CLJ[0], ND05, &h_Q[0], &h_PDOT[0]);

cout << "i  " << "Q[i]  " << "PDOT[i]" << endl;
for (int i=0; i<NDA3; i++){
    printf("%d  %e  %le \n" , i, h_Q[i], h_PDOT[i]);
}

}

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(3

旧时光的容颜 2024-11-22 07:57:29

如果您想要更好的数学精度,您应该在 C++ 中使用 double 而不是 float。不知道 Fortran 给你的精度是多少(很好,我猜是因为它广泛用于计算),但是 float 对于 C++ 数学来说是不精确的。

检查您的 C++/CUDA 编译器选项以获得最佳数学精度,例如。 适用于 Visual C++

If you want better math precision you should be using double not float in C++. Don't know what precision Fortran gives you (good, I am guessing since it's widely-used for calculation afaik), but float is imprecise for C++ math.

Check your C++/CUDA compiler options for best math precision also, eg. for Visual C++

挽清梦 2024-11-22 07:57:29

对于相同的输入,输出中间结果并追踪它们从何时开始出现分歧。

For the same inputs, output intermediate results and track down at which point do they start diverging.

沫离伤花 2024-11-22 07:57:29

我注意到 FORTRAN 和 C++ 代码之间很可能会引入数值差异:

  • FORTRAN 的 power (**) 与 C++
  • FORTRAN 的 sqrt< 中的 powf /code> 与 sqrt 在您的 C++
  • 编译器中选择 FORTRAN 和 C++ 之间的 SSE(您应该禁用它们两者的优化,然后检查数值差异)
  • 编译器选择浮点模式(快速与准确)
  • FORTRAN 选项可模拟一些较旧的浮点模式(VAX 浮点在非正常情况下会截断为 0)

Some things I noticed between your FORTRAN and C++ code that are very likely to introduce numerical differences:

  • FORTRAN's power (**) versus powf in your C++
  • FORTRAN's sqrt versus sqrt in your C++
  • Compiler selection of SSE between the FORTRAN and C++ (you should disable optimization on both of them and check the numerical differencs then)
  • Compiler selection of floating point mode (fast vs accurate)
  • FORTRAN option to emulate some of the older floating point modes (VAX floats come to mind with truncate towards 0 in denormal situations)
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文