为什么 valarray 这么慢？

发布于 2024-11-26 19:26:40 字数 3698 浏览 2 评论 0原文

我正在尝试使用 valarray，因为它在操作向量和矩阵时非常类似于 MATLAB。我首先做了一些性能检查，发现 valarray 无法达到书中声明的性能 C++ 编程语言，作者：Stroustrup。

该测试程序实际上执行了 500 万次双精度乘法。我认为 c = a*b 至少可以与 for 循环双类型元素乘法相媲美，但我完全错了。我在几台计算机和 Microsoft Visual C++ 6.0 和 Visual Studio 2008 上进行了尝试。

顺便说一下，我使用以下代码在 MATLAB 上进行了测试：

len = 5*1024*1024;
a = rand(len, 1);
b = rand(len, 1);
c = zeros(len, 1);
tic;
c = a.*b;
toc;

结果是 46 ms。这个时间精度不高；它只能作为参考。

代码为：

#include <iostream>
#include <valarray>
#include <iostream>
#include "windows.h"

using namespace std;
SYSTEMTIME stime;
LARGE_INTEGER sys_freq;

double gettime_hp();

int main()
{
    enum { N = 5*1024*1024 };
    valarray<double> a(N), b(N), c(N);
    QueryPerformanceFrequency(&sys_freq);
    int i, j;
    for (j=0 ; j<8 ; ++j)
    {
        for (i=0 ; i<N ; ++i)
        {
            a[i] = rand();
            b[i] = rand();
        }

        double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
        double dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            c1[i] = a1[i] * b1[i];
        dtime = gettime_hp()-dtime;
        cout << "double operator* " << dtime << " ms\n";

        dtime = gettime_hp();
        c = a*b ;
        dtime = gettime_hp() - dtime;
        cout << "valarray operator* " << dtime << " ms\n";

        dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            c[i] = a[i] * b[i];
        dtime = gettime_hp() - dtime;
        cout << "valarray[i] operator* " << dtime<< " ms\n";

        cout << "------------------------------------------------------\n";
    }
}

double gettime_hp()
{
    LARGE_INTEGER tick;
    extern LARGE_INTEGER sys_freq;
    QueryPerformanceCounter(&tick);
    return (double)tick.QuadPart * 1000.0 / sys_freq.QuadPart;
}

运行结果：（最大速度优化的发布模式）

double operator* 52.3019 ms
valarray operator* 128.338 ms
valarray[i] operator* 43.1801 ms
------------------------------------------------------
double operator* 43.4036 ms
valarray operator* 145.533 ms
valarray[i] operator* 44.9121 ms
------------------------------------------------------
double operator* 43.2619 ms
valarray operator* 158.681 ms
valarray[i] operator* 43.4871 ms
------------------------------------------------------
double operator* 42.7317 ms
valarray operator* 173.164 ms
valarray[i] operator* 80.1004 ms
------------------------------------------------------
double operator* 43.2236 ms
valarray operator* 158.004 ms
valarray[i] operator* 44.3813 ms
------------------------------------------------------

同等优化的调试模式：

double operator* 41.8123 ms
valarray operator* 201.484 ms
valarray[i] operator* 41.5452 ms
------------------------------------------------------
double operator* 40.2238 ms
valarray operator* 215.351 ms
valarray[i] operator* 40.2076 ms
------------------------------------------------------
double operator* 40.5859 ms
valarray operator* 232.007 ms
valarray[i] operator* 40.8803 ms
------------------------------------------------------
double operator* 40.9734 ms
valarray operator* 234.325 ms
valarray[i] operator* 40.9711 ms
------------------------------------------------------
double operator* 41.1977 ms
valarray operator* 234.409 ms
valarray[i] operator* 41.1429 ms
------------------------------------------------------
double operator* 39.7754 ms
valarray operator* 234.26 ms
valarray[i] operator* 39.6338 ms
------------------------------------------------------

原文

I am trying to use valarray since it is much like MATLAB while operating vector and matrices. I first did some performance check and found that valarray cannot achieve the performance declared as in the book C++ programming language by Stroustrup.

The test program actually did 5 million multiplication of doubles. I thought that c = a*b would at least be comparable to the for loop double type element multiplication, but I am totally wrong. I tried on several computers and Microsoft Visual C++ 6.0 and Visual Studio 2008.

By the way, I tested on MATLAB using the following code:

len = 5*1024*1024;
a = rand(len, 1);
b = rand(len, 1);
c = zeros(len, 1);
tic;
c = a.*b;
toc;

And the result is 46 ms. This time is not high precision; it only works as a reference.

The code is:

#include <iostream>
#include <valarray>
#include <iostream>
#include "windows.h"

using namespace std;
SYSTEMTIME stime;
LARGE_INTEGER sys_freq;

double gettime_hp();

int main()
{
    enum { N = 5*1024*1024 };
    valarray<double> a(N), b(N), c(N);
    QueryPerformanceFrequency(&sys_freq);
    int i, j;
    for (j=0 ; j<8 ; ++j)
    {
        for (i=0 ; i<N ; ++i)
        {
            a[i] = rand();
            b[i] = rand();
        }

        double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
        double dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            c1[i] = a1[i] * b1[i];
        dtime = gettime_hp()-dtime;
        cout << "double operator* " << dtime << " ms\n";

        dtime = gettime_hp();
        c = a*b ;
        dtime = gettime_hp() - dtime;
        cout << "valarray operator* " << dtime << " ms\n";

        dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            c[i] = a[i] * b[i];
        dtime = gettime_hp() - dtime;
        cout << "valarray[i] operator* " << dtime<< " ms\n";

        cout << "------------------------------------------------------\n";
    }
}

double gettime_hp()
{
    LARGE_INTEGER tick;
    extern LARGE_INTEGER sys_freq;
    QueryPerformanceCounter(&tick);
    return (double)tick.QuadPart * 1000.0 / sys_freq.QuadPart;
}

The running results: (release mode with maximal speed optimization)

double operator* 52.3019 ms
valarray operator* 128.338 ms
valarray[i] operator* 43.1801 ms
------------------------------------------------------
double operator* 43.4036 ms
valarray operator* 145.533 ms
valarray[i] operator* 44.9121 ms
------------------------------------------------------
double operator* 43.2619 ms
valarray operator* 158.681 ms
valarray[i] operator* 43.4871 ms
------------------------------------------------------
double operator* 42.7317 ms
valarray operator* 173.164 ms
valarray[i] operator* 80.1004 ms
------------------------------------------------------
double operator* 43.2236 ms
valarray operator* 158.004 ms
valarray[i] operator* 44.3813 ms
------------------------------------------------------

Debugging mode with same optimization:

double operator* 41.8123 ms
valarray operator* 201.484 ms
valarray[i] operator* 41.5452 ms
------------------------------------------------------
double operator* 40.2238 ms
valarray operator* 215.351 ms
valarray[i] operator* 40.2076 ms
------------------------------------------------------
double operator* 40.5859 ms
valarray operator* 232.007 ms
valarray[i] operator* 40.8803 ms
------------------------------------------------------
double operator* 40.9734 ms
valarray operator* 234.325 ms
valarray[i] operator* 40.9711 ms
------------------------------------------------------
double operator* 41.1977 ms
valarray operator* 234.409 ms
valarray[i] operator* 41.1429 ms
------------------------------------------------------
double operator* 39.7754 ms
valarray operator* 234.26 ms
valarray[i] operator* 39.6338 ms
------------------------------------------------------

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

掌心的温暖 2024-12-03 19:26:40

我刚刚在 Linux x86-64 系统（Sandy Bridge CPU）上尝试过：

gcc 4.5.0:

double operator* 9.64185 ms
valarray operator* 9.36987 ms
valarray[i] operator* 9.35815 ms

Intel ICC 12.0.2:

double operator* 7.76757 ms
valarray operator* 9.60208 ms
valarray[i] operator* 7.51409 ms

在这两种情况下我都只使用了 -O3 而没有其他与优化相关的标志。

看起来 MS C++ 编译器和/或 valarray 实现很糟糕。

这是针对 Linux 修改的 OP 代码：

#include <iostream>
#include <valarray>
#include <iostream>
#include <ctime>

using namespace std ;

double gettime_hp();

int main()
{
    enum { N = 5*1024*1024 };
    valarray<double> a(N), b(N), c(N) ;
    int i,j;
    for(  j=0 ; j<8 ; ++j )
    {
        for(  i=0 ; i<N ; ++i )
        {
            a[i]=rand();
            b[i]=rand();
        }

        double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0] ;
        double dtime=gettime_hp();
        for(  i=0 ; i<N ; ++i ) c1[i] = a1[i] * b1[i] ;
        dtime=gettime_hp()-dtime;
        cout << "double operator* " << dtime << " ms\n" ;

        dtime=gettime_hp();
        c = a*b ;
        dtime=gettime_hp()-dtime;
        cout << "valarray operator* " << dtime << " ms\n" ;

        dtime=gettime_hp();
        for(  i=0 ; i<N ; ++i ) c[i] = a[i] * b[i] ;
        dtime=gettime_hp()-dtime;
        cout << "valarray[i] operator* " << dtime<< " ms\n" ;

        cout << "------------------------------------------------------\n" ;
    }
}

double gettime_hp()
{
    struct timespec timestamp;

    clock_gettime(CLOCK_REALTIME, ×tamp);
    return timestamp.tv_sec * 1000.0 + timestamp.tv_nsec * 1.0e-6;
}

I just tried it on a Linux x86-64 system (Sandy Bridge CPU):

gcc 4.5.0:

double operator* 9.64185 ms
valarray operator* 9.36987 ms
valarray[i] operator* 9.35815 ms

Intel ICC 12.0.2:

double operator* 7.76757 ms
valarray operator* 9.60208 ms
valarray[i] operator* 7.51409 ms

In both cases I just used -O3 and no other optimisation-related flags.

It looks like the MS C++ compiler and/or valarray implementation suck.

Here's the OP's code modified for Linux:

#include <iostream>
#include <valarray>
#include <iostream>
#include <ctime>

using namespace std ;

double gettime_hp();

int main()
{
    enum { N = 5*1024*1024 };
    valarray<double> a(N), b(N), c(N) ;
    int i,j;
    for(  j=0 ; j<8 ; ++j )
    {
        for(  i=0 ; i<N ; ++i )
        {
            a[i]=rand();
            b[i]=rand();
        }

        double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0] ;
        double dtime=gettime_hp();
        for(  i=0 ; i<N ; ++i ) c1[i] = a1[i] * b1[i] ;
        dtime=gettime_hp()-dtime;
        cout << "double operator* " << dtime << " ms\n" ;

        dtime=gettime_hp();
        c = a*b ;
        dtime=gettime_hp()-dtime;
        cout << "valarray operator* " << dtime << " ms\n" ;

        dtime=gettime_hp();
        for(  i=0 ; i<N ; ++i ) c[i] = a[i] * b[i] ;
        dtime=gettime_hp()-dtime;
        cout << "valarray[i] operator* " << dtime<< " ms\n" ;

        cout << "------------------------------------------------------\n" ;
    }
}

double gettime_hp()
{
    struct timespec timestamp;

    clock_gettime(CLOCK_REALTIME, ×tamp);
    return timestamp.tv_sec * 1000.0 + timestamp.tv_nsec * 1.0e-6;
}

回复收藏 0 原文

多彩岁月 2024-12-03 19:26:40

我怀疑 c = a*b 比一次执行一个元素的操作慢得多的原因是

template<class T> valarray<T> operator*
    (const valarray<T>&, const valarray<T>&);

操作符必须分配内存来放入结果，然后按值返回。

即使使用“交换”来执行复制，该函数仍然具有

为生成的 valarray 初始化新 valarray 分配新块
的开销（有可能这可能会被优化掉）
将结果放入新的 valarray
内存中，在新的 valarray 初始化或设置结果值时对新的 valarray 进行分页，
取消分配旧的 valarray 得到替换为结果

I suspect that the reason c = a*b is so much slower than performing the operations an element at a time is that the

template<class T> valarray<T> operator*
    (const valarray<T>&, const valarray<T>&);

operator must allocate memory to put the result into, then returns that by value.

Even if a "swaptimization" is used to perform the copy, that function still has the overhead of

allocating the new block for the resulting valarray
initializing the new valarray (it's possible that this might be optimized away)
putting the results into the new valarray
paging in the memory for the new valarray as it is initialized or set with result values
deallocating the old valarray that gets replaced by the result

回复收藏 0 原文

不疑不惑不回忆 2024-12-03 19:26:40

valarray 的全部要点是在向量机上速度快，而 x86 机器则不然。

非向量机上的良好实现应该能够与您使用类似的东西获得的性能相匹配

for (i=0; i < N; ++i) 
    c1[i] = a1[i] * b1[i];

，而糟糕的实现当然不会。除非硬件中有某种东西可以加速并行处理，否则这将非常接近您能做到的最好结果。

The whole point of valarray is to be fast on vector machines, which x86 machines just aren't.

A good implementation on a nonvector machine should be able to match the performance that you get with something like

for (i=0; i < N; ++i) 
    c1[i] = a1[i] * b1[i];

and a bad one of course won't. Unless there is something in the hardware to expedite parallel processing, that is going to be pretty close to the best that you can do.

回复收藏 0 原文

久伴你 2024-12-03 19:26:40

我终于通过使用延迟评估得到了这个。代码可能很难看，因为我刚刚开始学习这些 C++ 高级概念。

代码如下：

#include <iostream>
#include <valarray>
#include <iostream>
#include "windows.h"

using namespace std;
SYSTEMTIME stime;
LARGE_INTEGER sys_freq;

double gettime_hp();

// To improve the c = a*b (it will generate a temporary first, assigned to 'c' and delete the temporary.
// Which causes the program really slow
// The solution is the expression template and let the compiler to decide when all the expression is known.


// Delayed evaluation
//typedef valarray<double> Vector;
class Vector;

class VecMul
{
    public:
        const Vector& va;
        const Vector& vb;
        //Vector& vc;
        VecMul(const Vector& v1, const Vector& v2): va(v1), vb(v2) {}
        operator Vector();
};

class Vector:public valarray<double>
{
    valarray<double> *p;

    public:
        explicit Vector(int n)
        {
            p = new valarray<double>(n);
        }
        Vector& operator = (const VecMul &m)
        {
            for(int i=0; i<m.va.size(); i++)
                (*p)[i] = (m.va)[i]*(m.vb)[i]; // Ambiguous
            return *this;
        }
        double& operator[](int i) const {return (*p)[i];} //const vector_type[i]
        int size()const {return (*p).size();}
};


inline VecMul operator*(const Vector& v1, const Vector& v2)
{
    return VecMul(v1, v2);
}


int main()
{
    enum {N = 5*1024*1024};
    Vector a(N), b(N), c(N);
    QueryPerformanceFrequency(&sys_freq);
    int i, j;
    for (j=0 ; j<8 ; ++j)
    {
        for (i=0 ; i<N ; ++i)
        {
            a[i] = rand();
            b[i] = rand();
        }

        double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
        double dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            c1[i] = a1[i] * b1[i];
        dtime = gettime_hp()-dtime;
        cout << "double operator* " << dtime << " ms\n";

        dtime = gettime_hp();
        c = a*b;
        dtime = gettime_hp()-dtime;
        cout << "valarray operator* " << dtime << " ms\n";

        dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            c[i] = a[i] * b[i];
        dtime = gettime_hp() - dtime;
        cout << "valarray[i] operator* " << dtime << " ms\n";

        cout << "------------------------------------------------------\n";
    }
}

double gettime_hp()
{
    LARGE_INTEGER tick;
    extern LARGE_INTEGER sys_freq;
    QueryPerformanceCounter(&tick);
    return (double)tick.QuadPart*1000.0/sys_freq.QuadPart;
}

在Visual Studio上运行结果为：

double operator* 41.2031 ms
valarray operator* 43.8407 ms
valarray[i] operator* 42.49 ms

I finally got this through using delayed evaluation. The code may be ugly since I am just starting learning these C++ advanced concepts.

Here is the code:

#include <iostream>
#include <valarray>
#include <iostream>
#include "windows.h"

using namespace std;
SYSTEMTIME stime;
LARGE_INTEGER sys_freq;

double gettime_hp();

// To improve the c = a*b (it will generate a temporary first, assigned to 'c' and delete the temporary.
// Which causes the program really slow
// The solution is the expression template and let the compiler to decide when all the expression is known.


// Delayed evaluation
//typedef valarray<double> Vector;
class Vector;

class VecMul
{
    public:
        const Vector& va;
        const Vector& vb;
        //Vector& vc;
        VecMul(const Vector& v1, const Vector& v2): va(v1), vb(v2) {}
        operator Vector();
};

class Vector:public valarray<double>
{
    valarray<double> *p;

    public:
        explicit Vector(int n)
        {
            p = new valarray<double>(n);
        }
        Vector& operator = (const VecMul &m)
        {
            for(int i=0; i<m.va.size(); i++)
                (*p)[i] = (m.va)[i]*(m.vb)[i]; // Ambiguous
            return *this;
        }
        double& operator[](int i) const {return (*p)[i];} //const vector_type[i]
        int size()const {return (*p).size();}
};


inline VecMul operator*(const Vector& v1, const Vector& v2)
{
    return VecMul(v1, v2);
}


int main()
{
    enum {N = 5*1024*1024};
    Vector a(N), b(N), c(N);
    QueryPerformanceFrequency(&sys_freq);
    int i, j;
    for (j=0 ; j<8 ; ++j)
    {
        for (i=0 ; i<N ; ++i)
        {
            a[i] = rand();
            b[i] = rand();
        }

        double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
        double dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            c1[i] = a1[i] * b1[i];
        dtime = gettime_hp()-dtime;
        cout << "double operator* " << dtime << " ms\n";

        dtime = gettime_hp();
        c = a*b;
        dtime = gettime_hp()-dtime;
        cout << "valarray operator* " << dtime << " ms\n";

        dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            c[i] = a[i] * b[i];
        dtime = gettime_hp() - dtime;
        cout << "valarray[i] operator* " << dtime << " ms\n";

        cout << "------------------------------------------------------\n";
    }
}

double gettime_hp()
{
    LARGE_INTEGER tick;
    extern LARGE_INTEGER sys_freq;
    QueryPerformanceCounter(&tick);
    return (double)tick.QuadPart*1000.0/sys_freq.QuadPart;
}

The running result on Visual studio is:

double operator* 41.2031 ms
valarray operator* 43.8407 ms
valarray[i] operator* 42.49 ms

回复收藏 0 原文

才能让你更想念 2024-12-03 19:26:40

我正在 x64 版本、Visual Studio 2010 中进行编译。我对您的代码进行了轻微的更改：

    double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
    double dtime = gettime_hp();
    for (i=0 ; i<N ; ++i)
        a1[i] *= b1[i];
    dtime = gettime_hp() - dtime;
    cout << "double operator* " << dtime << " ms\n";

    dtime = gettime_hp();
    a *= b;
    dtime = gettime_hp() - dtime;
    cout << "valarray operator* " << dtime << " ms\n";

    dtime = gettime_hp();
    for (i=0 ; i<N ; ++i)
        a[i] *= b[i];
    dtime = gettime_hp() - dtime;
    cout << "valarray[i] operator* " << dtime<< " ms\n";

    cout << "------------------------------------------------------\n" ;

在这里您可以看到我使用了 *= 而不是 c = a * b。在更现代的数学库中，使用非常复杂的表达式模板机制来消除这个问题。在这种情况下，我实际上从 valarray 获得了稍微快一点的结果，尽管这可能只是因为内容已经在缓存中。您看到的开销只是多余的临时变量，而不是 valarray 固有的内容，具体来说，您会看到与 std::string 类似的行为。

I'm compiling in release x64, Visual Studio 2010. I changed your code very slightly:

    double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
    double dtime = gettime_hp();
    for (i=0 ; i<N ; ++i)
        a1[i] *= b1[i];
    dtime = gettime_hp() - dtime;
    cout << "double operator* " << dtime << " ms\n";

    dtime = gettime_hp();
    a *= b;
    dtime = gettime_hp() - dtime;
    cout << "valarray operator* " << dtime << " ms\n";

    dtime = gettime_hp();
    for (i=0 ; i<N ; ++i)
        a[i] *= b[i];
    dtime = gettime_hp() - dtime;
    cout << "valarray[i] operator* " << dtime<< " ms\n";

    cout << "------------------------------------------------------\n" ;

Here you can see that I used *= instead of c = a * b. In more modern mathematical libraries, very complex expression template mechanisms are used that eliminate this problem. In this case, I actually got very slightly faster results from valarray, although that's probably just because the contents were already in a cache. The overhead that you are seeing is simply redundant temporaries and nothing intrinsic to valarray, specifically- you'd see the same behaviour with something like std::string.

回复收藏 0 原文

夏末染殇 2024-12-03 19:26:40

我认为迈克尔·伯尔的回答是正确的。也许您可以创建一个虚拟类型作为运算符+的返回值类型，并为该虚拟类型重新加载另一个operator=，例如operator=(virtual type& ; v){&valarray=&v;v=NULL;} （粗略地说）。

当然，这个想法在valarray上实现起来有一定难度。但是当你创建一个新类时，你可以尝试这个想法。然后，operator+ 的效率几乎与 operator+= 相同。

回复收藏 0 原文

℉服软 2024-12-03 19:26:40

嗯..我测试了 Blitz++ ，它与 valarray 相同......而且，Blitz++ [] 运算符非常慢。

#include <blitz/array.h>
#include <iostream>

#ifdef WIN32
#include "windows.h"
LARGE_INTEGER sys_freq;
#endif

#ifdef LINUX
<ctime>
#endif

using namespace std;
SYSTEMTIME stime;

__forceinline double gettime_hp();
double gettime_hp()
{
    #ifdef WIN32
        LARGE_INTEGER tick;
        extern LARGE_INTEGER sys_freq;
        QueryPerformanceCounter(&tick);
        return (double)tick.QuadPart * 1000.0 / sys_freq.QuadPart;
    #endif

    #ifdef LINUX
        struct timespec timestamp;

        clock_gettime(CLOCK_REALTIME, ×tamp);
        return timestamp.tv_sec * 1000.0 + timestamp.tv_nsec * 1.0e-6;
    #endif
}
BZ_USING_NAMESPACE(blitz)

int main()
{
    int N = 5*1024*1024;

    // Create three-dimensional arrays of double
    Array<double, 1> a(N), b(N), c(N);

    int i, j;

    #ifdef WIN32
        QueryPerformanceFrequency(&sys_freq);
    #endif

    for (j=0 ; j<8 ; ++j)
    {
        for (i=0 ; i<N ; ++i)
        {
            a[i] = rand();
            b[i] = rand();
        }

        double* a1 = a.data(), *b1 = b.data(), *c1 = c.data();
        double dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            c1[i] = a1[i] * b1[i];
        dtime = gettime_hp() - dtime;
        cout << "double operator* " << dtime << " ms\n";

        dtime = gettime_hp();
        c = a*b;
        dtime = gettime_hp() - dtime;
        cout << "blitz operator* " << dtime << " ms\n";

        dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            c[i] = a[i] * b[i];
        dtime = gettime_hp() - dtime;
        cout << "blitz[i] operator* " << dtime<< " ms\n";

        cout << "------------------------------------------------------\n";
    }
}

Hmm..I tested Blitz++ and it's same as valarray... And moreover, the Blitz++ [] operator is very slow.

#include <blitz/array.h>
#include <iostream>

#ifdef WIN32
#include "windows.h"
LARGE_INTEGER sys_freq;
#endif

#ifdef LINUX
<ctime>
#endif

using namespace std;
SYSTEMTIME stime;

__forceinline double gettime_hp();
double gettime_hp()
{
    #ifdef WIN32
        LARGE_INTEGER tick;
        extern LARGE_INTEGER sys_freq;
        QueryPerformanceCounter(&tick);
        return (double)tick.QuadPart * 1000.0 / sys_freq.QuadPart;
    #endif

    #ifdef LINUX
        struct timespec timestamp;

        clock_gettime(CLOCK_REALTIME, ×tamp);
        return timestamp.tv_sec * 1000.0 + timestamp.tv_nsec * 1.0e-6;
    #endif
}
BZ_USING_NAMESPACE(blitz)

int main()
{
    int N = 5*1024*1024;

    // Create three-dimensional arrays of double
    Array<double, 1> a(N), b(N), c(N);

    int i, j;

    #ifdef WIN32
        QueryPerformanceFrequency(&sys_freq);
    #endif

    for (j=0 ; j<8 ; ++j)
    {
        for (i=0 ; i<N ; ++i)
        {
            a[i] = rand();
            b[i] = rand();
        }

        double* a1 = a.data(), *b1 = b.data(), *c1 = c.data();
        double dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            c1[i] = a1[i] * b1[i];
        dtime = gettime_hp() - dtime;
        cout << "double operator* " << dtime << " ms\n";

        dtime = gettime_hp();
        c = a*b;
        dtime = gettime_hp() - dtime;
        cout << "blitz operator* " << dtime << " ms\n";

        dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            c[i] = a[i] * b[i];
        dtime = gettime_hp() - dtime;
        cout << "blitz[i] operator* " << dtime<< " ms\n";

        cout << "------------------------------------------------------\n";
    }
}

回复收藏 0 原文

~没有更多了~