犰狳 C++ ifft 性能不佳

发布于 2025-01-11 12:20:50 字数 3258 浏览 4 评论 0 原文

我有当前的测试代码，

#include <iostream>
#define ARMA_DONT_USE_WRAPPER
#include <armadillo>

using namespace std::complex_literals;

int main()
{
    arma::cx_mat testMat { };
    testMat.set_size(40, 19586);
    auto nPositions = static_cast<arma::sword>(floor(19586/2));
    arma::cx_rowvec a_vec {19586, arma::fill::randu};
    arma::cx_rowvec b_vec {19586, arma::fill::randu};
    arma::cx_rowvec c_vec {19586, arma::fill::randu};

    for (size_t nCo=0; nCo < 3; nCo++) {
        arma::rowvec d {19586, arma::fill::randu};
        for(size_t iDop = 0; iDop < 40; ++iDop)
        {
            
                arma::cx_rowvec signalFi = (b_vec % arma::exp(-1i*M_PI*a_vec));
                testMat.row(iDop) += arma::ifft(arma::shift(arma::fft(signalFi), nPositions).eval() % c_vec).eval();
            
        }
    }
return 0;
}

我正在尝试执行一些计算。 每次迭代的秒表共享性能约为：300毫秒，这对我的需求来说性能很差。

有人可以解释我做错了什么或一些技巧如何提高性能。

I used .eval() to perform 'eager' evaluation. 
gcc 11.2
armadillo 10.8.2
Release Mode -O3

更新版本。可以重新设计 ifft 函数吗？ 测试代码

#include <iostream>
#include <fftw3.h>
#include <armadillo>
#include "StopWatch.h"

using namespace std;

inline arma::cx_mat ifftshift(arma::cx_mat const &axx)
{
    return arma::shift(axx, -ceil(axx.n_rows/2), 0);
}

void ifft(arma::cx_mat &inMat, arma::cx_mat &outMat)
{
    size_t N = inMat.n_rows;
    size_t n_cols = inMat.n_cols;
    for (size_t index = 0; index < n_cols; ++index)
    {
        fftw_complex *in1  = reinterpret_cast<fftw_complex *>(inMat.colptr(index));
        fftw_complex *out1 = reinterpret_cast<fftw_complex *>(outMat.colptr(index));
        fftw_plan pl_ifft_cx1 = fftw_plan_dft_1d(N, in1, out1, FFTW_BACKWARD, FFTW_ESTIMATE);
        fftw_execute_dft(pl_ifft_cx1, in1, out1);
    }
    outMat /= N;
}

int main()
{
    arma::cx_mat B;

    B << std::complex<double>(+1.225e-01,+8.247e-01) << std::complex<double>(+4.078e-01,+5.632e-01) << std::complex<double>(+8.866e-01,+8.386e-01) << arma::endr
      << std::complex<double>(+5.958e-01,+1.015e-01) << std::complex<double>(+7.857e-01,+4.267e-01) << std::complex<double>(+7.997e-01,+9.176e-01) << arma::endr
      << std::complex<double>(+1.877e-01,+3.378e-01) << std::complex<double>(+2.921e-01,+9.651e-01) << std::complex<double>(+1.056e-01,+6.901e-01) << arma::endr
      << std::complex<double>(+2.322e-01,+6.990e-01) << std::complex<double>(+1.547e-01,+4.256e-01) << std::complex<double>(+9.094e-01,+1.194e-01) << arma::endr
      << std::complex<double>(+3.917e-01,+3.886e-01) << std::complex<double>(+2.166e-01,+4.962e-01) << std::complex<double>(+9.777e-01,+4.464e-01) << arma::endr;

    arma::cx_mat output(5,3);
    arma::cx_mat shifted = ifftshift(B);
    arma::cx_mat arma_result = arma::ifft(shifted);
    B.print("B");
    arma_result.print("arma_result");
    ifft(shifted, output);
    output.print("output");

    return 0;
}

原文

I have current test code

#include <iostream>
#define ARMA_DONT_USE_WRAPPER
#include <armadillo>

using namespace std::complex_literals;

int main()
{
    arma::cx_mat testMat { };
    testMat.set_size(40, 19586);
    auto nPositions = static_cast<arma::sword>(floor(19586/2));
    arma::cx_rowvec a_vec {19586, arma::fill::randu};
    arma::cx_rowvec b_vec {19586, arma::fill::randu};
    arma::cx_rowvec c_vec {19586, arma::fill::randu};

    for (size_t nCo=0; nCo < 3; nCo++) {
        arma::rowvec d {19586, arma::fill::randu};
        for(size_t iDop = 0; iDop < 40; ++iDop)
        {
            
                arma::cx_rowvec signalFi = (b_vec % arma::exp(-1i*M_PI*a_vec));
                testMat.row(iDop) += arma::ifft(arma::shift(arma::fft(signalFi), nPositions).eval() % c_vec).eval();
            
        }
    }
return 0;
}

I am trying to perform some computation.
StopWatch shared performance for each iteration around : 300 ms, which is bad performance for my needs.

Is someone which can explain what i am doing wrong or some tricks how can i increase the performance.

I used .eval() to perform 'eager' evaluation. 
gcc 11.2
armadillo 10.8.2
Release Mode -O3

Updated Version. Is possible to redesign the ifft function ?
Test Code

#include <iostream>
#include <fftw3.h>
#include <armadillo>
#include "StopWatch.h"

using namespace std;

inline arma::cx_mat ifftshift(arma::cx_mat const &axx)
{
    return arma::shift(axx, -ceil(axx.n_rows/2), 0);
}

void ifft(arma::cx_mat &inMat, arma::cx_mat &outMat)
{
    size_t N = inMat.n_rows;
    size_t n_cols = inMat.n_cols;
    for (size_t index = 0; index < n_cols; ++index)
    {
        fftw_complex *in1  = reinterpret_cast<fftw_complex *>(inMat.colptr(index));
        fftw_complex *out1 = reinterpret_cast<fftw_complex *>(outMat.colptr(index));
        fftw_plan pl_ifft_cx1 = fftw_plan_dft_1d(N, in1, out1, FFTW_BACKWARD, FFTW_ESTIMATE);
        fftw_execute_dft(pl_ifft_cx1, in1, out1);
    }
    outMat /= N;
}

int main()
{
    arma::cx_mat B;

    B << std::complex<double>(+1.225e-01,+8.247e-01) << std::complex<double>(+4.078e-01,+5.632e-01) << std::complex<double>(+8.866e-01,+8.386e-01) << arma::endr
      << std::complex<double>(+5.958e-01,+1.015e-01) << std::complex<double>(+7.857e-01,+4.267e-01) << std::complex<double>(+7.997e-01,+9.176e-01) << arma::endr
      << std::complex<double>(+1.877e-01,+3.378e-01) << std::complex<double>(+2.921e-01,+9.651e-01) << std::complex<double>(+1.056e-01,+6.901e-01) << arma::endr
      << std::complex<double>(+2.322e-01,+6.990e-01) << std::complex<double>(+1.547e-01,+4.256e-01) << std::complex<double>(+9.094e-01,+1.194e-01) << arma::endr
      << std::complex<double>(+3.917e-01,+3.886e-01) << std::complex<double>(+2.166e-01,+4.962e-01) << std::complex<double>(+9.777e-01,+4.464e-01) << arma::endr;

    arma::cx_mat output(5,3);
    arma::cx_mat shifted = ifftshift(B);
    arma::cx_mat arma_result = arma::ifft(shifted);
    B.print("B");
    arma_result.print("arma_result");
    ifft(shifted, output);
    output.print("output");

    return 0;
}

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

薄暮涼年 2025-01-18 12:20:50

我刚刚用我自己的库尝试了类似的操作，根据我的测量，您认为循环的每次迭代不应超过 1 毫秒（而不是 300 毫秒），这是正确的。

这是等效的代码，抱歉，这不是犰狳答案，我只是指出最小化操作和分配的具体目标是什么。

#include<multi/adaptors/fftw.hpp>
#include<multi/array.hpp>

namespace fftw = multi::fftw;

int main() {
    multi::array<std::complex<double>, 1> const arr = n_random_complex<double>(19586);
    multi::array<std::complex<double>, 1>       res(arr.extensions());  // output allocated only once                  

    fftw::plan fdft{arr, res, fftw::forward};  // fftw plan and internal buffers allocated only once

    auto const N = 40;
    for(int i = 0; i != N; ++i) {  // each iteration takes ~1ms in an intel-i7
        fdft(arr.base(), res.base());  // fft operation with precalculated plan
        std::rotate(res.begin(), res.begin() + res.size()/2, res.end());  // rotation (shift on size/2) done in place, no allocation either
    }
}

完整的代码和库位于： https://gitlab.com/correaa/boost-multi/-/blob/master/adaptors/fftw/test/shift.cpp#L45-58（额外的代码用于计时测量）。

同样能说明问题的是，我试图犯所有可能的错误来悲观代码。
为了尝试模仿我认为犰狳所做的“错误”......在循环内分配并始终进行复制。但我得到的是每次迭代需要 1.5 毫秒。

我的结论是，您的犰狳用法或库本身存在严重错误。

    multi::array<std::complex<double>, 1> const arr = n_random_complex<double>(19586);  BOOST_REQUIRE(arr.size() == 19586);

    auto const N = 40;
    for(int i = 0; i != N; ++i) {
        multi::array<std::complex<double>, 1>       res(arr.extensions(), 0.);
        fftw::plan fdft{arr, res, fftw::forward};
        fdft(arr.base(), res.base());
        multi::array<std::complex<double>, 1>       res_copy(arr.extensions(), 0.);
        std::rotate_copy(res.begin(), res.begin() + res.size()/2, res.end(), res_copy.begin());
    }

I just tried a similar operation with my own library and, according to my measurements, you are correct that each iteration of the loop shouldn't take more than 1 millisecond (instead of 300 ms).

This is the equivalent code, sorry that this is not an Armadillo answer, I am just pointing out what are the concrete goals for minimizing operations and allocations.

#include<multi/adaptors/fftw.hpp>
#include<multi/array.hpp>

namespace fftw = multi::fftw;

int main() {
    multi::array<std::complex<double>, 1> const arr = n_random_complex<double>(19586);
    multi::array<std::complex<double>, 1>       res(arr.extensions());  // output allocated only once                  

    fftw::plan fdft{arr, res, fftw::forward};  // fftw plan and internal buffers allocated only once

    auto const N = 40;
    for(int i = 0; i != N; ++i) {  // each iteration takes ~1ms in an intel-i7
        fdft(arr.base(), res.base());  // fft operation with precalculated plan
        std::rotate(res.begin(), res.begin() + res.size()/2, res.end());  // rotation (shift on size/2) done in place, no allocation either
    }
}

The full code and library is here: https://gitlab.com/correaa/boost-multi/-/blob/master/adaptors/fftw/test/shift.cpp#L45-58 (the extra code is for the timing measurement).

What is also telling is that I tried to do all the possible mistakes to pessimize the code.
To try to mimic what I think Armadillo is doing "wrong"... allocating inside the loop and making copies all the time. But what I get is that each iteration take 1.5 milliseconds.

My conclusion is that something is terribly wrong in your Armadillo usage or in the library itself.

    multi::array<std::complex<double>, 1> const arr = n_random_complex<double>(19586);  BOOST_REQUIRE(arr.size() == 19586);

    auto const N = 40;
    for(int i = 0; i != N; ++i) {
        multi::array<std::complex<double>, 1>       res(arr.extensions(), 0.);
        fftw::plan fdft{arr, res, fftw::forward};
        fdft(arr.base(), res.base());
        multi::array<std::complex<double>, 1>       res_copy(arr.extensions(), 0.);
        std::rotate_copy(res.begin(), res.begin() + res.size()/2, res.end(), res_copy.begin());
    }

回复收藏 0 原文

~没有更多了~