OpenMP 和归约()

发布于 2024-11-01 09:36:47 字数 3528 浏览 1 评论 0原文

我只有 3 个函数,一个是控制函数,接下来的 2 个函数是使用 OpenMP 以稍微不同的方式完成的。但是函数 thread1 给出的分数比 thread2 和 control 的分数高,我不知道为什么?

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

float function(float x){
    return pow(x,pow(x,sin(x)));
}



 float integrate(float begin, float end, int count){
    float score = 0 , width = (end-begin)/(1.0*count), i=begin, y1, y2;


    for(i = 0; i<count; i++){
            score += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
 }
    return score; 
 }




 float thread1(float begin, float end, int count){
    float score = 0 , width = (end-begin)/(1.0*count), y1, y2;

    int i;
    #pragma omp parallel for reduction(+:score) private(y1,i) shared(count)
    for(i = 0; i<count; i++){
        y1 = ((function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0);
       score = score + y1;
    }

    return score;
 }


 float thread2(float begin, float end, int count){
    float score = 0 , width = (end-begin)/(1.0*count), y1, y2;

    int i;
    float * tab = (float*)malloc(count * sizeof(float));

    #pragma omp parallel for
    for(i = 0; i<count; i++){
            tab[i] = (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }

    for(i=0; i<count; i++)
            score += tab[i];
    return score;
  }


  unsigned long long int rdtsc(void){
     unsigned long long int x;
     unsigned a, d;

    __asm__ volatile("rdtsc" : "=a" (a), "=d" (d));

    return ((unsigned long long)a) | (((unsigned long long)d) << 32);
   }






   int main(int argc, char** argv){
        unsigned long long counter = 0;


    //test
       counter = rdtsc();
       printf("control: %f \n ",integrate (atof(argv[1]), atof(argv[2]), atoi(argv[3])));
       printf("control count: %lld \n",rdtsc()-counter);
        counter = rdtsc();
       printf("thread1: %f \n ",thread1(atof(argv[1]), atof(argv[2]), atoi(argv[3])));
       printf("thread1 count: %lld \n",rdtsc()-counter);
        counter = rdtsc();
       printf("thread2: %f \n ",thread2(atof(argv[1]), atof(argv[2]), atoi(argv[3])));
       printf("thread2 count: %lld \n",rdtsc()-counter);

       return 0;
      }

以下是简单的回答:

 gcc -fopenmp zad2.c -o zad -pg -lm
 env OMP_NUM_THREADS=2 ./zad 3 13 100000
 control: 5407308.500000 
 control count: 138308058 
 thread1: 5407494.000000 
 thread1 count: 96525618 
 thread2: 5407308.500000 
 thread2 count: 104770859

更新:

好的,我尝试更快地完成此操作,并且不对周期值进行两次计数。

double thread3(double begin, double end, int count){
     double score = 0 , width = (end-begin)/(1.0*count), yp, yk;    
     int i,j, k;

     #pragma omp parallel private (yp,yk) 
     {
       int thread_num = omp_get_num_threads();
       k = count / thread_num;

    #pragma omp for private(i) reduction(+:score) 
    for(i=0; i<thread_num; i++){
        yp = function(begin + i*k*width);
        yk = function(begin + (i*k+1)*width);
        score += (yp + yk) * width / 2.0;
        for(j=i*k +1; j<(i+1)*k; j++){
            yp = yk;
            yk = function(begin + (j+1)*width);
            score  += (yp + yk) * width / 2.0;
        }
    }

  #pragma omp for private(i) reduction(+:score) 
  for(i = k*thread_num; i<count; i++)
    score += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
 }  
   return score;
 }

但经过几次测试,我发现分数接近正确值,但不相等。有时其中一个线程无法启动。当我不使用 OpenMp 时,该值是正确的。

I've got simply 3 functions, one is control function aan the next 2 function are done in a bit different way using OpenMP. But function thread1 gives another score than thread2 and control and I have no idea why?

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

float function(float x){
    return pow(x,pow(x,sin(x)));
}



 float integrate(float begin, float end, int count){
    float score = 0 , width = (end-begin)/(1.0*count), i=begin, y1, y2;


    for(i = 0; i<count; i++){
            score += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
 }
    return score; 
 }




 float thread1(float begin, float end, int count){
    float score = 0 , width = (end-begin)/(1.0*count), y1, y2;

    int i;
    #pragma omp parallel for reduction(+:score) private(y1,i) shared(count)
    for(i = 0; i<count; i++){
        y1 = ((function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0);
       score = score + y1;
    }

    return score;
 }


 float thread2(float begin, float end, int count){
    float score = 0 , width = (end-begin)/(1.0*count), y1, y2;

    int i;
    float * tab = (float*)malloc(count * sizeof(float));

    #pragma omp parallel for
    for(i = 0; i<count; i++){
            tab[i] = (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }

    for(i=0; i<count; i++)
            score += tab[i];
    return score;
  }


  unsigned long long int rdtsc(void){
     unsigned long long int x;
     unsigned a, d;

    __asm__ volatile("rdtsc" : "=a" (a), "=d" (d));

    return ((unsigned long long)a) | (((unsigned long long)d) << 32);
   }






   int main(int argc, char** argv){
        unsigned long long counter = 0;


    //test
       counter = rdtsc();
       printf("control: %f \n ",integrate (atof(argv[1]), atof(argv[2]), atoi(argv[3])));
       printf("control count: %lld \n",rdtsc()-counter);
        counter = rdtsc();
       printf("thread1: %f \n ",thread1(atof(argv[1]), atof(argv[2]), atoi(argv[3])));
       printf("thread1 count: %lld \n",rdtsc()-counter);
        counter = rdtsc();
       printf("thread2: %f \n ",thread2(atof(argv[1]), atof(argv[2]), atoi(argv[3])));
       printf("thread2 count: %lld \n",rdtsc()-counter);

       return 0;
      }

Here are simple answears :

 gcc -fopenmp zad2.c -o zad -pg -lm
 env OMP_NUM_THREADS=2 ./zad 3 13 100000
 control: 5407308.500000 
 control count: 138308058 
 thread1: 5407494.000000 
 thread1 count: 96525618 
 thread2: 5407308.500000 
 thread2 count: 104770859

Update:

Ok, I tried to do this more quickly, and not count values for periods twice.

double thread3(double begin, double end, int count){
     double score = 0 , width = (end-begin)/(1.0*count), yp, yk;    
     int i,j, k;

     #pragma omp parallel private (yp,yk) 
     {
       int thread_num = omp_get_num_threads();
       k = count / thread_num;

    #pragma omp for private(i) reduction(+:score) 
    for(i=0; i<thread_num; i++){
        yp = function(begin + i*k*width);
        yk = function(begin + (i*k+1)*width);
        score += (yp + yk) * width / 2.0;
        for(j=i*k +1; j<(i+1)*k; j++){
            yp = yk;
            yk = function(begin + (j+1)*width);
            score  += (yp + yk) * width / 2.0;
        }
    }

  #pragma omp for private(i) reduction(+:score) 
  for(i = k*thread_num; i<count; i++)
    score += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
 }  
   return score;
 }

But after few tests I found that the scores are near the right value, but not equal. Sometimes one of the threads doesn't start. When I'm not using OpenMp, the value is correct.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1

剩余の解释 2024-11-08 09:36:47

您正在对一个非常强的峰值函数进行积分 - x(xsin(x)) - 它在您积分的范围内覆盖了超过 7 个数量级。这大约是 32 位浮点数的限制,因此根据对数字求和的顺序,将会出现问题。这不是 OpenMP 的问题——它只是数字敏感性的问题。

积分函数

例如,考虑这个完全串行的代码执行相同的积分:

#include <stdio.h>
#include <math.h>

float function(float x){
    return pow(x,pow(x,sin(x)));
}

int main(int argc, char **argv) {

    const float begin=3., end=13.;
    const int count = 100000;
    const float width=(end-begin)/(1.*count);

    float integral1=0., integral2=0., integral3=0.;

    /* left to right */
    for (int i=0; i<count; i++) {
         integral1 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }

    /* right to left */
    for (int i=count-1; i>=0; i--) {
         integral2 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }

    /* centre outwards, first right-to-left, then left-to-right */
    for (int i=count/2; i<count; i++) {
         integral3 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }
    for (int i=count/2-1; i>=0; i--) {
         integral3 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }

    printf("Left to right: %lf\n", integral1);
    printf("Right to left: %lf\n", integral2);
    printf("Centre outwards: %lf\n", integral3);

    return 0;
}

运行这个,我们得到:

$ ./reduce
Left to right: 5407308.500000
Right to left: 5407430.000000
Centre outwards: 5407335.500000

--你看到的同样的差异。使用两个线程进行求和必然会改变求和的顺序,因此您的答案会发生变化。

这里有几个选项。如果这只是一个测试问题,并且这个函数实际上并不代表您将要集成的内容,那么您可能已经没问题了。否则,使用不同的数值方法可能会有所帮助。

但这里也有一个简单的解决方案 - 数字的范围超出了 float 的范围,使得答案对求和顺序非常敏感,但在 double 的范围内很合适,使问题变得不那么严重。请注意,更改为 double 并不是解决所有问题的神奇方法。在某些情况下,它只是推迟问题的解决或让您掩盖数值方法中的缺陷。但在这里它实际上很好地解决了根本问题。将上面的所有 float 更改为 double 给出:

$ ./reduce
Left to right: 5407589.272885
Right to left: 5407589.272885
Centre outwards: 5407589.272885

另一方面,如果您需要将此函数集成到范围 (18, 23)。

You're integrating a very strongly peaked function - x(xsin(x)) - which covers over 7 orders of magnitude in the range you're integrating it. That's about the limit for a 32-bit floating point number, so there are going to be issues depending on the order you sum the numbers. This isn't an OpenMP thing -- its just a numerical sensitivity thing.

The integrated function

So for instance, consider this completely serial code doing the same integral:

#include <stdio.h>
#include <math.h>

float function(float x){
    return pow(x,pow(x,sin(x)));
}

int main(int argc, char **argv) {

    const float begin=3., end=13.;
    const int count = 100000;
    const float width=(end-begin)/(1.*count);

    float integral1=0., integral2=0., integral3=0.;

    /* left to right */
    for (int i=0; i<count; i++) {
         integral1 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }

    /* right to left */
    for (int i=count-1; i>=0; i--) {
         integral2 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }

    /* centre outwards, first right-to-left, then left-to-right */
    for (int i=count/2; i<count; i++) {
         integral3 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }
    for (int i=count/2-1; i>=0; i--) {
         integral3 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }

    printf("Left to right: %lf\n", integral1);
    printf("Right to left: %lf\n", integral2);
    printf("Centre outwards: %lf\n", integral3);

    return 0;
}

Running this, we get:

$ ./reduce
Left to right: 5407308.500000
Right to left: 5407430.000000
Centre outwards: 5407335.500000

-- the same sort of differences you see. Doing the summation with two threads necessarily changes the order of the summation, and so your answer changes.

There's a few options here. If this was just a test proble, and this function doesn't actually represent what you'll be integrating, you might be fine already. Otherwise, using a different numerical method may help.

But also here, there is a simple solution - the range of the numbers exceeds the range of a float, making the answer very sensitive to summation order, but fits comfortably within the range of a double, making the problem much less severe. Note that changing to doubles is not a magic solution to everything; some cases it just postpones the problem or allows you to paper over a flaw in your numerical method. But here it actually addresses the underlying problem fairly well. Changing all the floats above to doubles gives:

$ ./reduce
Left to right: 5407589.272885
Right to left: 5407589.272885
Centre outwards: 5407589.272885

On the other hand, even doubles wouldn't save you if you needed to integrate this function in the range (18,23).

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文