Understanding the speed-up of an OpenMP program across NUMA nodes
I came across this speed-up behavior, and I am finding it hard to explain. Following is the background:
- Program
A Gaussian elimination solver is invoked inside a loop to solve linear systems, so that the workload can be parallelized across compute units. Each instance uses an augmented matrix of dimension M by (M+1), where the additional column holds the RHS.
- HPC setup - a Cray XC50 node with Intel Xeon Gold 6148 CPUs and the following configuration:
available: 2 nodes (0-1)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
node 0 size: 95325 MB
node 0 free: 93811 MB
node 1 cpus: 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
node 1 size: 96760 MB
node 1 free: 96374 MB
node distances:
node 0 1
0: 10 21
1: 21 10
Although this page does not describe the actual HPC system, the block diagram and the related explanation seem to fully apply: https://www.nas.nasa.gov/hecc/support/kb/skylake-processors_550.html. Specifically, sub-NUMA clustering appears to be disabled.
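For reference, a listing in the format shown above can typically be reproduced on the node with numactl, assuming it is installed:
numactl --hardware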
- Jobs submitted through ALPS (aprun) are as follows:
time aprun -n 1 -d 20 -j 1 -ss -cc 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 -e N=4000 -e M=200 -e MODE=2 ./gem
time aprun -n 1 -d 20 -j 1 -ss -cc 0,1,2,3,4,5,6,7,8,9,20,21,22,23,24,25,26,27,28,29 -e N=4000 -e M=200 -e MODE=2 ./gem
time aprun -n 1 -d 20 -j 1 -ss -cc 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29 -e N=4000 -e M=200 -e MODE=2 ./gem
time aprun -n 1 -d 20 -j 1 -ss -cc 0,1,2,3,4,5,6,7,8,9,30,31,32,33,34,35,36,37,38,39 -e N=4000 -e M=200 -e MODE=2 ./gem
time aprun -n 1 -d 20 -j 1 -ss -cc 40,41,42,43,44,45,46,47,48,49,60,61,62,63,64,65,66,67,68,69 -e N=4000 -e M=200 -e MODE=2 ./gem
In the above, N indicates the number of matrices and M the dimension of each matrix. These are passed to the program as environment variables and used internally. MODE can be ignored for this discussion.
The -cc list specifies the CPUs to bind to. OMP_NUM_THREADS is set to 20. The intent is to use 20 threads across 20 compute units.
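For readers without aprun, a rough sketch of the same pinning using the standard OpenMP affinity environment variables (assuming a Linux system with an OpenMP 4.0+ runtime; the CPU numbering is machine-specific, and on the Cray node the aprun -cc binding is what is actually used):
export OMP_NUM_THREADS=20
export OMP_PROC_BIND=true
# Explicit place list mirroring the first -cc line above (one place per CPU)
export OMP_PLACES="{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10},{11},{12},{13},{14},{15},{16},{17},{18},{19}"
N=4000 M=200 MODE=2 ./gem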
- The serial and parallel run times are recorded within the program using omp_get_wtime(), and the results are the following:
CPU Binding | Objective | Speed Up |
---|---|---|
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 | Load work across 20 physical cores on socket 0 | 13.081944 |
0,1,2,3,4,5,6,7,8,9,20,21,22,23,24,25,26,27,28,29 | Spread across the first 10 physical cores of socket 0 & the first 10 of socket 1 | 18.332559 |
10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29 | Spread across the 2nd set of 10 physical cores of socket 0 & the first 10 of socket 1 | 18.636265 |
40,41,42,43,44,45,46,47,48,49,60,61,62,63,64,65,66,67,68,69 | Spread across virtual (hyper-thread) cores on both sockets (40-0, 60-21) | 15.922209 |
Why is the speed-up lower in the first case, when all 20 physical cores of socket 0 are being used? The understanding here is that when work is spread across sockets, the UPI comes into effect and things should be slower, whereas it seems to be exactly the opposite. Also, what can possibly explain the last scenario, where virtual cores are being used?
Note: We have tried multiple iterations and the results for the above combinations are pretty consistent.
Edit2: Source code
#define _GNU_SOURCE
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include "sched.h"
#include "omp.h"
double drand(double low, double high, unsigned int *seed)
{
return ((double)rand_r(seed) * (high - low)) / (double)RAND_MAX + low;
}
void init_vars(int *N, int *M, int *mode)
{
const char *number_of_instances = getenv("N");
if (number_of_instances) {
*N = atoi(number_of_instances);
}
const char *matrix_dim = getenv("M");
if (matrix_dim) {
*M = atoi(matrix_dim);
}
const char *running_mode = getenv("MODE");
if (running_mode) {
*mode = atoi(running_mode);
}
}
void print_matrix(double *instance, int M)
{
for (int row = 0; row < M; row++) {
for (int column = 0; column <= M; column++) {
printf("%lf ", instance[row * (M + 1) + column]);
}
printf("\n");
}
printf("\n");
}
void swap(double *a, double *b)
{
double temp = *a;
*a = *b;
*b = temp;
}
void init_matrix(double *instance, unsigned int M)
{
unsigned int seed = 45613 + 19 * omp_get_thread_num();
for (int row = 0; row < M; row++) {
for (int column = 0; column <= M; column++) {
instance[row * (M + 1) + column] = drand(-1.0, 1.0, &seed);
}
}
}
void initialize_and_solve(int M)
{
double *instance;
instance = malloc(M * (M + 1) * sizeof(double));
// Initialise the matrix
init_matrix(instance, M);
// Performing elementary operations
int i, j, k = 0, c, flag = 0, m = 0;
for (i = 0; i < M; i++) {
if (instance[i * (M + 2)] == 0) { // i * (M + 2) == i * (M + 1) + i, i.e. the diagonal (pivot) element
c = 1;
while ((i + c) < M && instance[(i + c) * (M + 1) + i] == 0)
c++;
if ((i + c) == M) {
flag = 1; // no non-zero pivot found below: matrix is singular
break;
}
for (j = i, k = 0; k <= M; k++) {
swap(&instance[j * (M + 1) + k], &instance[(j + c) * (M + 1) + k]);
}
}
for (j = 0; j < M; j++) {
// Excluding all i == j
if (i != j) {
// Converting Matrix to reduced row
// echelon form(diagonal matrix)
double pro = instance[j * (M + 1) + i] / instance[i * (M + 2)];
for (k = 0; k <= M; k++)
instance[j * (M + 1) + k] -= (instance[i * (M + 1) + k]) * pro;
}
}
}
// Get the solution in the last column
for (int i = 0; i < M; i++) {
instance[i * (M + 1) + M] /= instance[i * (M + 2)];
}
free(instance);
instance = NULL;
}
double solve_serial(int N, int M)
{
double now = omp_get_wtime();
for (int i = 0; i < N; i++) {
initialize_and_solve(M);
}
return omp_get_wtime() - now;
}
double solve_parallel(int N, int M)
{
double now = omp_get_wtime();
#pragma omp parallel for
for (int i = 0; i < N; i++) {
initialize_and_solve(M);
}
return omp_get_wtime() - now;
}
int main(int argc, char **argv)
{
// Default parameters
int N = 200, M = 200, mode = 2;
if (argc == 4) {
N = atoi(argv[1]);
M = atoi(argv[2]);
mode = atoi(argv[3]);
}
init_vars(&N, &M, &mode);
if (mode == 0) {
// Serial only
double l2_norm_serial = 0.0;
double serial = solve_serial(N, M);
printf("Time, %d, %d, %lf\n", N, M, serial);
} else if (mode == 1) {
// Parallel only
double l2_norm_parallel = 0.0;
double parallel = solve_parallel(N, M);
printf("Time, %d, %d, %lf\n", N, M, parallel);
} else {
// Both serial and parallel
// Solve using GEM (serial)
double serial = solve_serial(N, M);
// Solve using GEM (parallel)
double parallel = solve_parallel(N, M);
printf("Time, %d, %d, %lf, %lf, %lf\n", N, M, serial, parallel, serial / parallel);
}
return 0;
}
Edit3: Rephrased the first point to clarify what is actually being done (based on feedback in the comments).
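As an aside (this helper is not part of the program above), a minimal sketch that can be launched under the same aprun -cc settings to confirm which logical CPU each OpenMP thread actually lands on, assuming glibc's sched_getcpu() is available:
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <omp.h>
int main(void)
{
    #pragma omp parallel
    {
        // Each thread reports its OpenMP thread ID and the logical CPU it is currently running on
        printf("thread %2d on cpu %2d\n", omp_get_thread_num(), sched_getcpu());
    }
    return 0;
}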
2 Answers
You say you implement a "Simple implementation of Gaussian Elimination". Sorry, there is no such thing. There are multiple different algorithms and they all come with their own analysis. But let's assume you use the textbook one. Even then, Gaussian Elimination is not simple.
First of all, you haven't stated that you initialized your data in parallel. If you don't do that, all the data will wind up on socket 0 and you will get bad performance, never mind the speedup. But let's assume you did the right thing here. (If not, google "first touch".)
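For illustration only (this function is not from the question or this answer; the name is hypothetical), a minimal sketch of parallel first-touch initialization of one shared array, assuming Linux's default first-touch page-placement policy:
#include <stdlib.h>
// First-touch sketch: the thread that first writes a page determines which
// NUMA node that page is placed on, so initialize with the same schedule
// that the compute loops will later use.
double *alloc_first_touch(size_t n)
{
    double *a = malloc(n * sizeof(double));
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < n; i++)
        a[i] = 0.0; // first write places this page near the writing thread
    return a;
}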
In the GE algorithm, each of the sequential k iterations works on a smaller and smaller subset of the data. This means that no simple mapping of data to cores is possible. If you place your data in such a way that initially each core works on local data, this will quickly no longer be the case. In fact, after half the number of iterations, half your cores will be pulling data from the other socket, leading to NUMA coherence delays. Maybe a spread binding is better here than your compact binding.
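For example, a sketch of requesting a spread binding through the standard OpenMP affinity variables (on the Cray node the aprun -cc list may take precedence, so treat this as illustrative only):
export OMP_PLACES=cores
export OMP_PROC_BIND=spread    # spread the 20 threads as far apart as possible
export OMP_NUM_THREADS=20
N=4000 M=200 MODE=2 ./gem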
Results often depend on the application, but some patterns happen regularly. My guess is that your application makes heavy use of main RAM, and 2 sockets result in more DDR4 RAM modules (memory channels) being used than only one. Indeed, with local NUMA-node allocations, 1 socket can access RAM at 128 GB/s while 2 sockets can access RAM at 256 GB/s. With a balanced use of the DDR4 RAM modules, the performance would be far worse and bounded by the UPI (I do not expect 2 sockets to be much slower because of the full-duplex data transfer).
The UPI is only a bottleneck if data is massively transferred between the two sockets, but good NUMA applications should not do that, because they should operate on their own NUMA node's memory.
You can check UPI and RAM throughput usage with hardware counters.
As for the last scenario (virtual cores), I do not have an explanation. Note that the higher IDs are the second hardware thread of each core, so it is certainly related to some low-level behaviour of hyper-threading (maybe some processes are bound to certain PUs, causing pre-emption on the target PUs, or the second PU simply has a lower priority somehow). Note also that physical core IDs and logical PU IDs are often not mapped the same way, so if you use the wrong ones you could end up binding 2 threads to the same core. I advise you to use hwloc to check that.
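For example, a sketch of the kind of checks meant here, assuming hwloc and Intel PCM are installed on the node (exact tool names vary slightly between versions):
hwloc-ls          # or lstopo: shows how logical PU numbers map to cores and sockets
pcm-numa 1        # local vs. remote DRAM accesses per socket (Intel PCM)
pcm-memory 1      # per-socket/per-channel DRAM bandwidth (Intel PCM)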