OpenMP：增加线程数量反而增加了执行时间

发布于 2025-01-27 13:09:40 字数 1637 浏览 1 评论 0原文

我正在实现稀疏矩阵乘法（元素类型为 std::complex），先将矩阵转换为 CSR（压缩稀疏行）格式，并使用 OpenMP 进行并行化。但我注意到，增加线程数量并不一定能提高性能，有时甚至恰恰相反！为什么会这样？我该如何解决这个问题？

// Dense row-major matrix: one inner vector per row, entries are complex numbers with int parts.
// NOTE(review): the C++ standard leaves std::complex<T> unspecified for T other than
// float/double/long double; it works on common implementations but is not guaranteed.
using matrix = std::vector<std::vector<std::complex<int>>>;

// Sparse matrix in compressed sparse row (CSR) form.
// Row r occupies the half-open index range [row_ptr[r], row_ptr[r + 1]) of
// `values` / `cols_index` (this is how multiply_omp walks the structure).
// The scalar members carry default initializers so that a default-constructed
// CSR never holds indeterminate sizes; field order is unchanged, so aggregate
// initialization keeps working (C++14 and later).
struct CSR {
    std::vector<std::complex<int>> values; // non-zero values
    std::vector<int> row_ptr; // per-row start offsets into values / cols_index
    std::vector<int> cols_index; // column index of each stored value
    int rows = 0; // number of rows
    int cols = 0; // number of columns
    int NNZ = 0; // number of non-zero elements
};

/**
 * @brief Multiplies two CSR matrices and returns the product as a dense matrix.
 *
 * B is passed through sparse_transpose() (defined elsewhere; presumably the
 * CSR transpose of B) so that output entry (i, j) is the sparse dot product of
 * row i of A and row j of B_t. Output rows are distributed across OpenMP
 * threads; every iteration writes only result[i], so no locking is required.
 *
 * @param A          left operand.
 * @param B          right operand; A.cols must equal B.rows.
 * @param num_threds number of OpenMP threads to request. 0 is not a valid
 *                   thread count, so in that case the runtime's current
 *                   setting is left untouched.
 * @return dense matrix with A.rows rows and B.cols columns.
 * @throws const char* ("Error") when A.cols != B.rows.
 */
const matrix multiply_omp (const CSR& A,
    const CSR& B,const unsigned int num_threds=4) {
    if (A.cols != B.rows)
        throw "Error";
    const CSR B_t = sparse_transpose(B);
    // omp_set_num_threads expects a positive int; skip the call for 0.
    if (num_threds > 0)
        omp_set_num_threads(static_cast<int>(num_threds));
    matrix result(A.rows, std::vector < std::complex < int >>(B.cols, 0));
    // Rows can hold very different numbers of non-zeros, so a dynamic schedule
    // lets idle threads pick up remaining rows instead of waiting on a fixed
    // static partition. Indices are declared per loop, which makes them
    // private to each thread automatically.
    #pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < A.rows; i++) {
        const int a_begin = A.row_ptr[i];
        const int a_end = A.row_ptr[i + 1];
        // An empty row of A yields an all-zero output row, which is already in place.
        if (a_begin == a_end)
            continue;
        std::vector < std::complex < int >>& out_row = result[i];
        for (int j = 0; j < B_t.rows; j++) {
            const int b_begin = B_t.row_ptr[j];
            const int b_end = B_t.row_ptr[j + 1];
            std::complex < int > sum(0, 0);
            for (int k = a_begin; k < a_end; k++) {
                const int a_col = A.cols_index[k];
                for (int l = b_begin; l < b_end; l++) {
                    if (a_col == B_t.cols_index[l]) {
                        sum += A.values[k] * B_t.values[l];
                        // Only the first matching entry of B_t's row is used.
                        break;
                    }
                }
            }
            // result starts zero-filled and each (i, j) is visited once, so
            // storing the sum is equivalent to accumulating into it; zero
            // sums are skipped to avoid needless writes to the dense output.
            if (sum != std::complex < int >(0, 0)) {
                out_row[j] = sum;
            }
        }
    }
    return result;
}

原文

I'm implementing sparse matrix multiplication (element type std::complex) after converting the matrices to CSR (compressed sparse row) format, and I'm using OpenMP for this. However, I noticed that increasing the number of threads doesn't necessarily improve performance; sometimes it does exactly the opposite! Why is that the case, and what can I do to solve the issue?

// Dense row-major matrix: one inner vector per row, entries are complex numbers with int parts.
// NOTE(review): the C++ standard leaves std::complex<T> unspecified for T other than
// float/double/long double; it works on common implementations but is not guaranteed.
using matrix = std::vector<std::vector<std::complex<int>>>;

// Sparse matrix in compressed sparse row (CSR) form.
// Row r occupies the half-open index range [row_ptr[r], row_ptr[r + 1]) of
// `values` / `cols_index` (this is how multiply_omp walks the structure).
// The scalar members carry default initializers so that a default-constructed
// CSR never holds indeterminate sizes; field order is unchanged, so aggregate
// initialization keeps working (C++14 and later).
struct CSR {
    std::vector<std::complex<int>> values; // non-zero values
    std::vector<int> row_ptr; // per-row start offsets into values / cols_index
    std::vector<int> cols_index; // column index of each stored value
    int rows = 0; // number of rows
    int cols = 0; // number of columns
    int NNZ = 0; // number of non-zero elements
};

/**
 * @brief Multiplies two CSR matrices and returns the product as a dense matrix.
 *
 * B is passed through sparse_transpose() (defined elsewhere; presumably the
 * CSR transpose of B) so that output entry (i, j) is the sparse dot product of
 * row i of A and row j of B_t. Output rows are distributed across OpenMP
 * threads; every iteration writes only result[i], so no locking is required.
 *
 * @param A          left operand.
 * @param B          right operand; A.cols must equal B.rows.
 * @param num_threds number of OpenMP threads to request. 0 is not a valid
 *                   thread count, so in that case the runtime's current
 *                   setting is left untouched.
 * @return dense matrix with A.rows rows and B.cols columns.
 * @throws const char* ("Error") when A.cols != B.rows.
 */
const matrix multiply_omp (const CSR& A,
    const CSR& B,const unsigned int num_threds=4) {
    if (A.cols != B.rows)
        throw "Error";
    const CSR B_t = sparse_transpose(B);
    // omp_set_num_threads expects a positive int; skip the call for 0.
    if (num_threds > 0)
        omp_set_num_threads(static_cast<int>(num_threds));
    matrix result(A.rows, std::vector < std::complex < int >>(B.cols, 0));
    // Rows can hold very different numbers of non-zeros, so a dynamic schedule
    // lets idle threads pick up remaining rows instead of waiting on a fixed
    // static partition. Indices are declared per loop, which makes them
    // private to each thread automatically.
    #pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < A.rows; i++) {
        const int a_begin = A.row_ptr[i];
        const int a_end = A.row_ptr[i + 1];
        // An empty row of A yields an all-zero output row, which is already in place.
        if (a_begin == a_end)
            continue;
        std::vector < std::complex < int >>& out_row = result[i];
        for (int j = 0; j < B_t.rows; j++) {
            const int b_begin = B_t.row_ptr[j];
            const int b_end = B_t.row_ptr[j + 1];
            std::complex < int > sum(0, 0);
            for (int k = a_begin; k < a_end; k++) {
                const int a_col = A.cols_index[k];
                for (int l = b_begin; l < b_end; l++) {
                    if (a_col == B_t.cols_index[l]) {
                        sum += A.values[k] * B_t.values[l];
                        // Only the first matching entry of B_t's row is used.
                        break;
                    }
                }
            }
            // result starts zero-filled and each (i, j) is visited once, so
            // storing the sum is equivalent to accumulating into it; zero
            // sums are skipped to avoid needless writes to the dense output.
            if (sum != std::complex < int >(0, 0)) {
                out_row[j] = sum;
            }
        }
    }
    return result;
}

分享到QQ

分享到微博