OpenMP 和归约()
我只有 3 个函数,一个是控制函数,接下来的 2 个函数是使用 OpenMP 以稍微不同的方式完成的。但是函数 thread1 给出的分数比 thread2 和 control 的分数高,我不知道为什么?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
float function(float x){
return pow(x,pow(x,sin(x)));
}
float integrate(float begin, float end, int count){
float score = 0 , width = (end-begin)/(1.0*count), i=begin, y1, y2;
for(i = 0; i<count; i++){
score += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
}
return score;
}
float thread1(float begin, float end, int count){
float score = 0 , width = (end-begin)/(1.0*count), y1, y2;
int i;
#pragma omp parallel for reduction(+:score) private(y1,i) shared(count)
for(i = 0; i<count; i++){
y1 = ((function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0);
score = score + y1;
}
return score;
}
float thread2(float begin, float end, int count){
float score = 0 , width = (end-begin)/(1.0*count), y1, y2;
int i;
float * tab = (float*)malloc(count * sizeof(float));
#pragma omp parallel for
for(i = 0; i<count; i++){
tab[i] = (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
}
for(i=0; i<count; i++)
score += tab[i];
return score;
}
unsigned long long int rdtsc(void){
unsigned long long int x;
unsigned a, d;
__asm__ volatile("rdtsc" : "=a" (a), "=d" (d));
return ((unsigned long long)a) | (((unsigned long long)d) << 32);
}
int main(int argc, char** argv){
unsigned long long counter = 0;
//test
counter = rdtsc();
printf("control: %f \n ",integrate (atof(argv[1]), atof(argv[2]), atoi(argv[3])));
printf("control count: %lld \n",rdtsc()-counter);
counter = rdtsc();
printf("thread1: %f \n ",thread1(atof(argv[1]), atof(argv[2]), atoi(argv[3])));
printf("thread1 count: %lld \n",rdtsc()-counter);
counter = rdtsc();
printf("thread2: %f \n ",thread2(atof(argv[1]), atof(argv[2]), atoi(argv[3])));
printf("thread2 count: %lld \n",rdtsc()-counter);
return 0;
}
以下是简单的回答:
gcc -fopenmp zad2.c -o zad -pg -lm
env OMP_NUM_THREADS=2 ./zad 3 13 100000
control: 5407308.500000
control count: 138308058
thread1: 5407494.000000
thread1 count: 96525618
thread2: 5407308.500000
thread2 count: 104770859
更新:
好的,我尝试更快地完成此操作,并且不对周期值进行两次计数。
double thread3(double begin, double end, int count){
double score = 0 , width = (end-begin)/(1.0*count), yp, yk;
int i,j, k;
#pragma omp parallel private (yp,yk)
{
int thread_num = omp_get_num_threads();
k = count / thread_num;
#pragma omp for private(i) reduction(+:score)
for(i=0; i<thread_num; i++){
yp = function(begin + i*k*width);
yk = function(begin + (i*k+1)*width);
score += (yp + yk) * width / 2.0;
for(j=i*k +1; j<(i+1)*k; j++){
yp = yk;
yk = function(begin + (j+1)*width);
score += (yp + yk) * width / 2.0;
}
}
#pragma omp for private(i) reduction(+:score)
for(i = k*thread_num; i<count; i++)
score += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
}
return score;
}
但经过几次测试,我发现分数接近正确值,但不相等。有时其中一个线程无法启动。当我不使用 OpenMp 时,该值是正确的。
I've got simply 3 functions, one is control function aan the next 2 function are done in a bit different way using OpenMP. But function thread1 gives another score than thread2 and control and I have no idea why?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
float function(float x){
return pow(x,pow(x,sin(x)));
}
float integrate(float begin, float end, int count){
float score = 0 , width = (end-begin)/(1.0*count), i=begin, y1, y2;
for(i = 0; i<count; i++){
score += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
}
return score;
}
float thread1(float begin, float end, int count){
float score = 0 , width = (end-begin)/(1.0*count), y1, y2;
int i;
#pragma omp parallel for reduction(+:score) private(y1,i) shared(count)
for(i = 0; i<count; i++){
y1 = ((function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0);
score = score + y1;
}
return score;
}
float thread2(float begin, float end, int count){
float score = 0 , width = (end-begin)/(1.0*count), y1, y2;
int i;
float * tab = (float*)malloc(count * sizeof(float));
#pragma omp parallel for
for(i = 0; i<count; i++){
tab[i] = (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
}
for(i=0; i<count; i++)
score += tab[i];
return score;
}
unsigned long long int rdtsc(void){
unsigned long long int x;
unsigned a, d;
__asm__ volatile("rdtsc" : "=a" (a), "=d" (d));
return ((unsigned long long)a) | (((unsigned long long)d) << 32);
}
int main(int argc, char** argv){
unsigned long long counter = 0;
//test
counter = rdtsc();
printf("control: %f \n ",integrate (atof(argv[1]), atof(argv[2]), atoi(argv[3])));
printf("control count: %lld \n",rdtsc()-counter);
counter = rdtsc();
printf("thread1: %f \n ",thread1(atof(argv[1]), atof(argv[2]), atoi(argv[3])));
printf("thread1 count: %lld \n",rdtsc()-counter);
counter = rdtsc();
printf("thread2: %f \n ",thread2(atof(argv[1]), atof(argv[2]), atoi(argv[3])));
printf("thread2 count: %lld \n",rdtsc()-counter);
return 0;
}
Here are simple answears :
gcc -fopenmp zad2.c -o zad -pg -lm
env OMP_NUM_THREADS=2 ./zad 3 13 100000
control: 5407308.500000
control count: 138308058
thread1: 5407494.000000
thread1 count: 96525618
thread2: 5407308.500000
thread2 count: 104770859
Update:
Ok, I tried to do this more quickly, and not count values for periods twice.
double thread3(double begin, double end, int count){
double score = 0 , width = (end-begin)/(1.0*count), yp, yk;
int i,j, k;
#pragma omp parallel private (yp,yk)
{
int thread_num = omp_get_num_threads();
k = count / thread_num;
#pragma omp for private(i) reduction(+:score)
for(i=0; i<thread_num; i++){
yp = function(begin + i*k*width);
yk = function(begin + (i*k+1)*width);
score += (yp + yk) * width / 2.0;
for(j=i*k +1; j<(i+1)*k; j++){
yp = yk;
yk = function(begin + (j+1)*width);
score += (yp + yk) * width / 2.0;
}
}
#pragma omp for private(i) reduction(+:score)
for(i = k*thread_num; i<count; i++)
score += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
}
return score;
}
But after few tests I found that the scores are near the right value, but not equal. Sometimes one of the threads doesn't start. When I'm not using OpenMp, the value is correct.
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
您正在对一个非常强的峰值函数进行积分 - x(xsin(x)) - 它在您积分的范围内覆盖了超过 7 个数量级。这大约是 32 位浮点数的限制,因此根据对数字求和的顺序,将会出现问题。这不是 OpenMP 的问题——它只是数字敏感性的问题。
例如,考虑这个完全串行的代码执行相同的积分:
运行这个,我们得到:
--你看到的同样的差异。使用两个线程进行求和必然会改变求和的顺序,因此您的答案会发生变化。
这里有几个选项。如果这只是一个测试问题,并且这个函数实际上并不代表您将要集成的内容,那么您可能已经没问题了。否则,使用不同的数值方法可能会有所帮助。
但这里也有一个简单的解决方案 - 数字的范围超出了 float 的范围,使得答案对求和顺序非常敏感,但在 double 的范围内很合适,使问题变得不那么严重。请注意,更改为 double 并不是解决所有问题的神奇方法。在某些情况下,它只是推迟问题的解决或让您掩盖数值方法中的缺陷。但在这里它实际上很好地解决了根本问题。将上面的所有
float
更改为double
给出:另一方面,如果您需要将此函数集成到范围 (18, 23)。
You're integrating a very strongly peaked function - x(xsin(x)) - which covers over 7 orders of magnitude in the range you're integrating it. That's about the limit for a 32-bit floating point number, so there are going to be issues depending on the order you sum the numbers. This isn't an OpenMP thing -- its just a numerical sensitivity thing.
So for instance, consider this completely serial code doing the same integral:
Running this, we get:
-- the same sort of differences you see. Doing the summation with two threads necessarily changes the order of the summation, and so your answer changes.
There's a few options here. If this was just a test proble, and this function doesn't actually represent what you'll be integrating, you might be fine already. Otherwise, using a different numerical method may help.
But also here, there is a simple solution - the range of the numbers exceeds the range of a
float
, making the answer very sensitive to summation order, but fits comfortably within the range of adouble
, making the problem much less severe. Note that changing todouble
s is not a magic solution to everything; some cases it just postpones the problem or allows you to paper over a flaw in your numerical method. But here it actually addresses the underlying problem fairly well. Changing all thefloat
s above todouble
s gives:On the other hand, even doubles wouldn't save you if you needed to integrate this function in the range (18,23).