Openmp 基本并行化

发布于 2024-10-17 09:18:51 字数 2544 浏览 0 评论 0原文

我在并发课程中使用 OpenMP 编写一些并行 C 代码时陷入困境。

这是一个片段

#include <stdio.h>
#include <time.h>
#include <math.h>

#define FALSE 0
#define TRUE 1

int count_primes_0(int);
int count_primes_1(int);
int count_primes_2(int);

int main(int argc, char *argv[]){
    int n;

    if (argc != 2){
        printf("Incorrect Invocation, use: \nq1 N");
        return 0;
    } else {
        n = atoi(argv[1]);  
    }

    if (n < 0){
        printf("N cannot be negative");
        return 0;
    }

    printf("N = %d\n", n);

    //omp_set_num_threads(1);
    time_it(count_primes_0, n, "Method 0");
    time_it(count_primes_1, n, "Method 1");
    time_it(count_primes_2, n, "Method 2");

    return 0;
}

int is_prime(int n){
    for(int i = 2; i <= (int)(sqrt((double) n)); i++){
        if ((n % i) == 0){
            return FALSE;
        }
    }

    return n > 1;
}

void time_it( int (*f)(int), int n, char *string){
    clock_t start_clock;
    clock_t end_clock;
    double calc_time;
    int nprimes;

    struct timeval start_val;
    struct timeval end_val;

    start_clock = clock();
    nprimes = (*f)(n);
    end_clock = clock();
    calc_time = ((double)end_clock - (double)start_clock) / CLOCKS_PER_SEC;
    printf("\tNumber of primes: %d \t Time taken: %fs\n\n", nprimes, calc_time);
}

// METHOD 0
// Base Case no parallelization
int count_primes_0(int n){
    int nprimes = 0;

    for(int i = 1; i <= n; i++){
        if (is_prime(i)) {
            nprimes++;
        }
    }

    return nprimes;
}

//METHOD 1
// Use only For and Critical Constructs
int count_primes_1(int n){
    int nprimes = 0;

    #pragma omp parallel for
    for(int i = 1; i <= n; i++){
        if (is_prime(i)) {
            #pragma omp critical
            nprimes++;
        }
    }

    return nprimes;
}

//METHOD 2
// Use Reduction
int count_primes_2(int n){
    int nprimes = 0;

    #pragma omp parallel for reduction(+:nprimes)
    for(int i = 1; i <= n; i++){
        if (is_prime(i)) {
           nprimes++;
        }
    }

    return nprimes;
}

我面临的问题是，当我使用 omp_set_num_threads() 时，我使用的线程更少我的函数运行得越快——或者更接近基本非并行案例的运行时间

时间结果：这些在 8 核机器 8 线程上运行

：方法0：0.07s；方法一：1.63s；方法2：1.4s

4线程：方法0：0.07s；方法一：0.16s；方法2：0.16s

2个线程：方法0：0.07s；方法一：0.10；方法 2：0.09

1 线程：方法0：0.07s；方法一：0.08s；方法 2：0.07s

我尝试禁用优化并使用不同的 gcc 版本，没有区别

任何帮助都是值得赞赏的。

编辑：在Linux中使用时钟会返回“不正确”的时间，挂钟时间正是我所需要的，因此使用ether omp_get_wtime()或Linux函数timeit会产生正确的结果。

原文

I've gotten stuck writing some parallel c code using OpenMP for a concurrency course.

Heres a snippet

#include <stdio.h>
#include <time.h>
#include <math.h>

#define FALSE 0
#define TRUE 1

int count_primes_0(int);
int count_primes_1(int);
int count_primes_2(int);

int main(int argc, char *argv[]){
    int n;

    if (argc != 2){
        printf("Incorrect Invocation, use: \nq1 N");
        return 0;
    } else {
        n = atoi(argv[1]);  
    }

    if (n < 0){
        printf("N cannot be negative");
        return 0;
    }

    printf("N = %d\n", n);

    //omp_set_num_threads(1);
    time_it(count_primes_0, n, "Method 0");
    time_it(count_primes_1, n, "Method 1");
    time_it(count_primes_2, n, "Method 2");

    return 0;
}

int is_prime(int n){
    for(int i = 2; i <= (int)(sqrt((double) n)); i++){
        if ((n % i) == 0){
            return FALSE;
        }
    }

    return n > 1;
}

void time_it( int (*f)(int), int n, char *string){
    clock_t start_clock;
    clock_t end_clock;
    double calc_time;
    int nprimes;

    struct timeval start_val;
    struct timeval end_val;

    start_clock = clock();
    nprimes = (*f)(n);
    end_clock = clock();
    calc_time = ((double)end_clock - (double)start_clock) / CLOCKS_PER_SEC;
    printf("\tNumber of primes: %d \t Time taken: %fs\n\n", nprimes, calc_time);
}

// METHOD 0
// Base Case no parallelization
int count_primes_0(int n){
    int nprimes = 0;

    for(int i = 1; i <= n; i++){
        if (is_prime(i)) {
            nprimes++;
        }
    }

    return nprimes;
}

//METHOD 1
// Use only For and Critical Constructs
int count_primes_1(int n){
    int nprimes = 0;

    #pragma omp parallel for
    for(int i = 1; i <= n; i++){
        if (is_prime(i)) {
            #pragma omp critical
            nprimes++;
        }
    }

    return nprimes;
}

//METHOD 2
// Use Reduction
int count_primes_2(int n){
    int nprimes = 0;

    #pragma omp parallel for reduction(+:nprimes)
    for(int i = 1; i <= n; i++){
        if (is_prime(i)) {
           nprimes++;
        }
    }

    return nprimes;
}

The problem I'm facing is that when I use omp_set_num_threads() the less threads I use
the faster my functions run -- or get closer to the runtime of the base unparallelized case

Time Results:
These are run on an 8 core machine

8 Threads:
Method 0: 0.07s; Method 1: 1.63s; Method 2: 1.4s

4 Threads:
Method 0: 0.07s; Method 1: 0.16s; Method 2: 0.16s

2 Threads:
Method 0: 0.07s; Method 1: 0.10; Method 2: 0.09

1 Thread:
Method 0: 0.07s; Method 1: 0.08s; Method 2: 0.07s

I've tried disabling optimization and using a different gcc version with no difference

Any help is appreciated.

EDIT: Using clock in Linux returns the 'incorrect' time, wall clock time is what I needed so using ether omp_get_wtime() or the Linux function timeit would produce the proper results.

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

凯凯我们等你回来 2024-10-24 09:18:51

我很惊讶您看到上述程序取得了成功。如果您查看 RedHat Linux 手册页中的 Clock()，您会发现它“返回程序使用的处理器时间的近似值”。放入 OpenMP 指令会导致更多开销，因此您应该会看到运行 OpenMP 时使用的总体处理器时间更多。您需要查看的是经过时间（或挂钟时间）。当您并行运行时（并且您有一个可以从并行中受益的程序），您将看到运行时间减少。 OpenMP 规范定义了一个例程 (omp_get_wtime()) 来提供此信息。

更改程序以使用clock()和omp_get_wtime()进行报告：

$ a.out 1000000 (1,000,000)

2个处理器：

clock(): 0.23 wtime(): 0.23 Clock(): 0.96 wtime(): 0.16 Clock(): 0.59 wtime(): 0.09

4 个处理器:

时钟(): 0.24 wtime(): 0.24 时钟(): 0.97 wtime(): 0.16 时钟(): 0.57 wtime(): 0.09

8 个处理器:

时钟(): 0.24 wtime(): 0.24 时钟(): 2.60 wtime(): 0.26 时钟(): 0.64 wtime(): 0.09

$ a.out 10000000 (10,000,000)

2 个处理器:

时钟(): 6.07 wtime(): 6.07 时钟(): 10.4 wtime() ： 1.78 时钟（）： 11.3 wtime（）： 1.65

4 个处理器：