Openmp 基本并行化
我在并发课程中使用 OpenMP 编写一些并行 C 代码时陷入困境。
这是一个片段
#include <stdio.h>
#include <time.h>
#include <math.h>
#define FALSE 0
#define TRUE 1
int count_primes_0(int);
int count_primes_1(int);
int count_primes_2(int);
int main(int argc, char *argv[]){
int n;
if (argc != 2){
printf("Incorrect Invocation, use: \nq1 N");
return 0;
} else {
n = atoi(argv[1]);
}
if (n < 0){
printf("N cannot be negative");
return 0;
}
printf("N = %d\n", n);
//omp_set_num_threads(1);
time_it(count_primes_0, n, "Method 0");
time_it(count_primes_1, n, "Method 1");
time_it(count_primes_2, n, "Method 2");
return 0;
}
int is_prime(int n){
for(int i = 2; i <= (int)(sqrt((double) n)); i++){
if ((n % i) == 0){
return FALSE;
}
}
return n > 1;
}
void time_it( int (*f)(int), int n, char *string){
clock_t start_clock;
clock_t end_clock;
double calc_time;
int nprimes;
struct timeval start_val;
struct timeval end_val;
start_clock = clock();
nprimes = (*f)(n);
end_clock = clock();
calc_time = ((double)end_clock - (double)start_clock) / CLOCKS_PER_SEC;
printf("\tNumber of primes: %d \t Time taken: %fs\n\n", nprimes, calc_time);
}
// METHOD 0
// Base Case no parallelization
int count_primes_0(int n){
int nprimes = 0;
for(int i = 1; i <= n; i++){
if (is_prime(i)) {
nprimes++;
}
}
return nprimes;
}
//METHOD 1
// Use only For and Critical Constructs
int count_primes_1(int n){
int nprimes = 0;
#pragma omp parallel for
for(int i = 1; i <= n; i++){
if (is_prime(i)) {
#pragma omp critical
nprimes++;
}
}
return nprimes;
}
//METHOD 2
// Use Reduction
int count_primes_2(int n){
int nprimes = 0;
#pragma omp parallel for reduction(+:nprimes)
for(int i = 1; i <= n; i++){
if (is_prime(i)) {
nprimes++;
}
}
return nprimes;
}
我面临的问题是,当我使用 omp_set_num_threads() 时,我使用的线程更少 我的函数运行得越快——或者更接近基本非并行案例的运行时间
时间结果: 这些在 8 核机器 8 线程上运行
: 方法0:0.07s;方法一:1.63s;方法2:1.4s
4线程: 方法0:0.07s;方法一:0.16s;方法2:0.16s
2个线程: 方法0:0.07s;方法一:0.10;方法 2:0.09
1 线程: 方法0:0.07s;方法一:0.08s;方法 2:0.07s
我尝试禁用优化并使用不同的 gcc 版本,没有区别
任何帮助都是值得赞赏的。
编辑:在Linux中使用时钟会返回“不正确”的时间,挂钟时间正是我所需要的,因此使用ether omp_get_wtime()或Linux函数timeit会产生正确的结果。
I've gotten stuck writing some parallel c code using OpenMP for a concurrency course.
Heres a snippet
#include <stdio.h>
#include <time.h>
#include <math.h>
#define FALSE 0
#define TRUE 1
int count_primes_0(int);
int count_primes_1(int);
int count_primes_2(int);
int main(int argc, char *argv[]){
int n;
if (argc != 2){
printf("Incorrect Invocation, use: \nq1 N");
return 0;
} else {
n = atoi(argv[1]);
}
if (n < 0){
printf("N cannot be negative");
return 0;
}
printf("N = %d\n", n);
//omp_set_num_threads(1);
time_it(count_primes_0, n, "Method 0");
time_it(count_primes_1, n, "Method 1");
time_it(count_primes_2, n, "Method 2");
return 0;
}
int is_prime(int n){
for(int i = 2; i <= (int)(sqrt((double) n)); i++){
if ((n % i) == 0){
return FALSE;
}
}
return n > 1;
}
void time_it( int (*f)(int), int n, char *string){
clock_t start_clock;
clock_t end_clock;
double calc_time;
int nprimes;
struct timeval start_val;
struct timeval end_val;
start_clock = clock();
nprimes = (*f)(n);
end_clock = clock();
calc_time = ((double)end_clock - (double)start_clock) / CLOCKS_PER_SEC;
printf("\tNumber of primes: %d \t Time taken: %fs\n\n", nprimes, calc_time);
}
// METHOD 0
// Base Case no parallelization
int count_primes_0(int n){
int nprimes = 0;
for(int i = 1; i <= n; i++){
if (is_prime(i)) {
nprimes++;
}
}
return nprimes;
}
//METHOD 1
// Use only For and Critical Constructs
int count_primes_1(int n){
int nprimes = 0;
#pragma omp parallel for
for(int i = 1; i <= n; i++){
if (is_prime(i)) {
#pragma omp critical
nprimes++;
}
}
return nprimes;
}
//METHOD 2
// Use Reduction
int count_primes_2(int n){
int nprimes = 0;
#pragma omp parallel for reduction(+:nprimes)
for(int i = 1; i <= n; i++){
if (is_prime(i)) {
nprimes++;
}
}
return nprimes;
}
The problem I'm facing is that when I use omp_set_num_threads() the less threads I use
the faster my functions run -- or get closer to the runtime of the base unparallelized case
Time Results:
These are run on an 8 core machine
8 Threads:
Method 0: 0.07s; Method 1: 1.63s; Method 2: 1.4s
4 Threads:
Method 0: 0.07s; Method 1: 0.16s; Method 2: 0.16s
2 Threads:
Method 0: 0.07s; Method 1: 0.10; Method 2: 0.09
1 Thread:
Method 0: 0.07s; Method 1: 0.08s; Method 2: 0.07s
I've tried disabling optimization and using a different gcc version with no difference
Any help is appreciated.
EDIT: Using clock in Linux returns the 'incorrect' time, wall clock time is what I needed so using ether omp_get_wtime() or the Linux function timeit would produce the proper results.
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(2)
我很惊讶您看到上述程序取得了成功。如果您查看 RedHat Linux 手册页中的 Clock(),您会发现它“返回程序使用的处理器时间的近似值”。放入 OpenMP 指令会导致更多开销,因此您应该会看到运行 OpenMP 时使用的总体处理器时间更多。您需要查看的是经过时间(或挂钟时间)。当您并行运行时(并且您有一个可以从并行中受益的程序),您将看到运行时间减少。 OpenMP 规范定义了一个例程 (omp_get_wtime()) 来提供此信息。
更改程序以使用clock()和omp_get_wtime()进行报告:
$ a.out 1000000 (1,000,000)
2个处理器:
clock(): 0.23 wtime(): 0.23 Clock(): 0.96 wtime(): 0.16 Clock(): 0.59 wtime(): 0.09
4 个处理器:
时钟(): 0.24 wtime(): 0.24 时钟(): 0.97 wtime(): 0.16 时钟(): 0.57 wtime(): 0.09
8 个处理器:
时钟(): 0.24 wtime(): 0.24 时钟(): 2.60 wtime(): 0.26 时钟(): 0.64 wtime(): 0.09
$ a.out 10000000 (10,000,000)
2 个处理器:
时钟(): 6.07 wtime(): 6.07 时钟(): 10.4 wtime() : 1.78 时钟(): 11.3 wtime(): 1.65
4 个处理器:
时钟(): 6.07 wtime(): 6.07 时钟(): 11.5 wtime(): 1.71 时钟(): 10.7 wtime(): 1.72
8 个处理器:
时钟( ):6.07 wtime():6.07 时钟():9.92 wtime():1.83 时钟():11.9 wtime():1.86
I am surprised that you have seen any success with the program as it is above. If you look at the RedHat Linux man page for clock(), you will see that it "returns an approximation of processor time used by the program". Putting in OpenMP directives causes more overhead, and thus you should see more overall processor time used when you run OpenMP. What you need to look at is elapse time (or wall clock time). When you run in parallel (and you have a program that can benefit from parallel), you will see the elapse time go down. The OpenMP specification defines a routine (omp_get_wtime()) to provide this information.
Changing your program to report using clock() and omp_get_wtime():
$ a.out 1000000 (1,000,000)
2 processors:
clock(): 0.23 wtime(): 0.23 clock(): 0.96 wtime(): 0.16 clock(): 0.59 wtime(): 0.09
4 processors:
clock(): 0.24 wtime(): 0.24 clock(): 0.97 wtime(): 0.16 clock(): 0.57 wtime(): 0.09
8 processors:
clock(): 0.24 wtime(): 0.24 clock(): 2.60 wtime(): 0.26 clock(): 0.64 wtime(): 0.09
$ a.out 10000000 (10,000,000)
2 processors:
clock(): 6.07 wtime(): 6.07 clock(): 10.4 wtime(): 1.78 clock(): 11.3 wtime(): 1.65
4 processors:
clock(): 6.07 wtime(): 6.07 clock(): 11.5 wtime(): 1.71 clock(): 10.7 wtime(): 1.72
8 processors:
clock(): 6.07 wtime(): 6.07 clock(): 9.92 wtime(): 1.83 clock(): 11.9 wtime(): 1.86
OpenMP 不会并行化循环及其内部的函数调用,除非参数是私有的。解决方案是在循环中内联
is_prime()
。OpenMP does not parallelize loops with function calls inside it, unless arguments are private. A solution would be to inline
is_prime()
in your loop.