How fast is D compared to C++?

I like some features of D, but do they come with a runtime penalty?

To compare, I implemented a simple program that computes scalar products of many short vectors both in C++ and in D. The result is surprising:

  • D: 18.9 s [see below for final runtime]
  • C++: 3.8 s

Is C++ really almost five times as fast, or did I make a mistake in the D program?

I compiled C++ with g++ -O3 (gcc-snapshot 2011-02-19) and D with dmd -O (dmd 2.052) on a moderately recent Linux desktop. The results are reproducible over several runs and the standard deviations are negligible.

Here is the C++ program:

#include <iostream>
#include <random>
#include <chrono>
#include <string>

#include <vector>
#include <array>

typedef std::chrono::duration<long, std::ratio<1, 1000>> millisecs;
template <typename _T>
long time_since(std::chrono::time_point<_T>& time) {
  long tm = std::chrono::duration_cast<millisecs>(std::chrono::system_clock::now() - time).count();
  time = std::chrono::system_clock::now();
  return tm;
}

const long N = 20000;
const int size = 10;

typedef int value_type;
typedef long long result_type;
typedef std::vector<value_type> vector_t;
typedef vector_t::size_type size_type;

inline value_type scalar_product(const vector_t& x, const vector_t& y) {
  value_type res = 0;
  size_type siz = x.size();
  for (size_type i = 0; i < siz; ++i)
    res += x[i] * y[i];
  return res;
}

int main() {
  auto tm_before = std::chrono::system_clock::now();

  // 1. allocate and fill randomly many short vectors
  vector_t* xs = new vector_t [N];
  for (int i = 0; i < N; ++i) {
    xs[i] = vector_t(size);
  }
  std::cerr << "allocation: " << time_since(tm_before) << " ms" << std::endl;

  std::mt19937 rnd_engine;
  std::uniform_int_distribution<value_type> runif_gen(-1000, 1000);
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < size; ++j)
      xs[i][j] = runif_gen(rnd_engine);
  std::cerr << "random generation: " << time_since(tm_before) << " ms" << std::endl;

  // 2. compute all pairwise scalar products:
  time_since(tm_before);
  result_type avg = 0;
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < N; ++j) 
      avg += scalar_product(xs[i], xs[j]);
  avg = avg / N*N;  // note: evaluates as (avg / N) * N, not avg / (N * N)
  auto time = time_since(tm_before);
  std::cout << "result: " << avg << std::endl;
  std::cout << "time: " << time << " ms" << std::endl;
}

And here is the D version:

import std.stdio;
import std.datetime;
import std.random;

const long N = 20000;
const int size = 10;

alias int value_type;
alias long result_type;
alias value_type[] vector_t;
alias uint size_type;

value_type scalar_product(const ref vector_t x, const ref vector_t y) {
  value_type res = 0;
  size_type siz = x.length;
  for (size_type i = 0; i < siz; ++i)
    res += x[i] * y[i];
  return res;
}

int main() {   
  auto tm_before = Clock.currTime();

  // 1. allocate and fill randomly many short vectors
  vector_t[] xs;
  xs.length = N;
  for (int i = 0; i < N; ++i) {
    xs[i].length = size;
  }
  writefln("allocation: %i ", (Clock.currTime() - tm_before));
  tm_before = Clock.currTime();

  for (int i = 0; i < N; ++i)
    for (int j = 0; j < size; ++j)
      xs[i][j] = uniform(-1000, 1000);
  writefln("random: %i ", (Clock.currTime() - tm_before));
  tm_before = Clock.currTime();

  // 2. compute all pairwise scalar products:
  result_type avg = cast(result_type) 0;
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < N; ++j) 
      avg += scalar_product(xs[i], xs[j]);
  avg = avg / N*N;  // note: evaluates as (avg / N) * N, not avg / (N * N)
  writefln("result: %d", avg);
  auto time = Clock.currTime() - tm_before;
  writefln("scalar products: %i ", time);

  return 0;
}

8 Answers

允世 2024-10-26 16:56:13

To enable all optimizations and disable all safety checks, compile your D program with the following DMD flags:

-O -inline -release -noboundscheck

EDIT: I've tried your programs with g++, dmd and gdc. dmd does lag behind, but gdc achieves performance very close to g++. The commandline I used was gdmd -O -release -inline (gdmd is a wrapper around gdc which accepts dmd options).

Looking at the assembler listing, it looks like neither dmd nor gdc inlined scalar_product, but g++/gdc did emit MMX instructions, so they might be auto-vectorizing the loop.

追星践月 2024-10-26 16:56:13

One big thing that slows D down is a subpar garbage collection implementation. Benchmarks that don't heavily stress the GC will show very similar performance to C and C++ code compiled with the same compiler backend. Benchmarks that do heavily stress the GC will show that D performs abysmally. Rest assured, though, this is a single (albeit severe) quality-of-implementation issue, not a baked-in guarantee of slowness. Also, D gives you the ability to opt out of GC and tune memory management in performance-critical bits, while still using it in the less performance-critical 95% of your code.
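
Roughly, opting out looks like this (a minimal sketch using core.memory; the function name is illustrative and not part of the benchmark):

import core.memory : GC;

void hotSection() {
    GC.disable();               // automatic collections are suspended in here
    scope (exit) GC.enable();   // restore normal collection on the way out

    // ... allocation-heavy, performance-critical work ...
}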

I've put some effort into improving GC performance lately and the results have been rather dramatic, at least on synthetic benchmarks. Hopefully these changes will be integrated into one of the next few releases and will mitigate the issue.

黯然#的苍凉 2024-10-26 16:56:13

This is a very instructive thread, thanks for all the work to the OP and helpers.

One note - this test is not assessing the general question of abstraction/feature penalty or even that of backend quality. It focuses on virtually one optimization (loop optimization). I think it's fair to say that gcc's backend is somewhat more refined than dmd's, but it would be a mistake to assume that the gap between them is as large for all tasks.

如歌彻婉言 2024-10-26 16:56:13

Definitely seems like a quality-of-implementation issue.

I ran some tests with the OP's code and made some changes. I actually got D running faster than C++ when comparing LDC against clang++, operating on the assumption that the arrays must be allocated dynamically (xs and the associated scalars). See below for some numbers.

Questions for the OP

Is it intentional that the same seed be used for each iteration of C++, while not so for D?
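
(For reference, a minimal sketch of how the D side could be seeded deterministically as well; the helper name is illustrative and not part of the benchmark:)

    import std.random : Random, uniform;

    void fillDeterministic(int[][] xs) {
        auto rng = Random(42);   // fixed seed: reproducible, like a default-constructed std::mt19937
        foreach (ref row; xs)
            foreach (ref val; row)
                val = uniform(-1000, 1000, rng);
    }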

Setup

I have tweaked the original D source (dubbed scalar.d) to make it portable between platforms. This only involved changing the type of the numbers used to access and modify the size of arrays.

After this, I made the following changes:

  • Used uninitializedArray to avoid default inits for scalars in xs (probably made the biggest difference). This is important because D normally default-inits everything silently, which C++ does not.

  • Factored out printing code and replaced writefln with writeln

  • Changed imports to be selective
  • Used pow operator (^^) instead of manual multiplication for final step of calculating average
  • Removed the size_type and replaced appropriately with the new index_type alias

...thus resulting in scalar2.d (pastebin):

    import std.stdio : writeln;
    import std.datetime : Clock, Duration;
    import std.array : uninitializedArray;
    import std.random : uniform;

    alias result_type = long;
    alias value_type = int;
    alias vector_t = value_type[];
    alias index_type = typeof(vector_t.init.length);// Make index integrals portable - Linux is ulong, Win8.1 is uint

    immutable long N = 20000;
    immutable int size = 10;

    // Replaced for loops with appropriate foreach versions
    value_type scalar_product(in ref vector_t x, in ref vector_t y) { // "in" is the same as "const" here
      value_type res = 0;
      for(index_type i = 0; i < size; ++i)
        res += x[i] * y[i];
      return res;
    }

    int main() {
      auto tm_before = Clock.currTime;
      auto countElapsed(in string taskName) { // Factor out printing code
        writeln(taskName, ": ", Clock.currTime - tm_before);
        tm_before = Clock.currTime;
      }

      // 1. allocate and fill randomly many short vectors
      vector_t[] xs = uninitializedArray!(vector_t[])(N);// Avoid default inits of inner arrays
      for(index_type i = 0; i < N; ++i)
        xs[i] = uninitializedArray!(vector_t)(size);// Avoid more default inits of values
      countElapsed("allocation");

      for(index_type i = 0; i < N; ++i)
        for(index_type j = 0; j < size; ++j)
          xs[i][j] = uniform(-1000, 1000);
      countElapsed("random");

      // 2. compute all pairwise scalar products:
      result_type avg = 0;
      for(index_type i = 0; i < N; ++i)
        for(index_type j = 0; j < N; ++j)
          avg += scalar_product(xs[i], xs[j]);
      avg /= N ^^ 2;// Replace manual multiplication with pow operator
      writeln("result: ", avg);
      countElapsed("scalar products");

      return 0;
    }

After testing scalar2.d (which prioritized optimization for speed), out of curiosity I replaced the loops in main with foreach equivalents, and called it scalar3.d (pastebin):

    import std.stdio : writeln;
    import std.datetime : Clock, Duration;
    import std.array : uninitializedArray;
    import std.random : uniform;

    alias result_type = long;
    alias value_type = int;
    alias vector_t = value_type[];
    alias index_type = typeof(vector_t.init.length);// Make index integrals portable - Linux is ulong, Win8.1 is uint

    immutable long N = 20000;
    immutable int size = 10;

    // Replaced for loops with appropriate foreach versions
    value_type scalar_product(in ref vector_t x, in ref vector_t y) { // "in" is the same as "const" here
      value_type res = 0;
      for(index_type i = 0; i < size; ++i)
        res += x[i] * y[i];
      return res;
    }

    int main() {
      auto tm_before = Clock.currTime;
      auto countElapsed(in string taskName) { // Factor out printing code
        writeln(taskName, ": ", Clock.currTime - tm_before);
        tm_before = Clock.currTime;
      }

      // 1. allocate and fill randomly many short vectors
      vector_t[] xs = uninitializedArray!(vector_t[])(N);// Avoid default inits of inner arrays
      foreach(ref x; xs)
        x = uninitializedArray!(vector_t)(size);// Avoid more default inits of values
      countElapsed("allocation");

      foreach(ref x; xs)
        foreach(ref val; x)
          val = uniform(-1000, 1000);
      countElapsed("random");

      // 2. compute all pairwise scalar products:
      result_type avg = 0;
      foreach(const ref x; xs)
        foreach(const ref y; xs)
          avg += scalar_product(x, y);
      avg /= N ^^ 2;// Replace manual multiplication with pow operator
      writeln("result: ", avg);
      countElapsed("scalar products");

      return 0;
    }

I compiled each of these tests using an LLVM-based compiler, since LDC seems to be the best option for D compilation in terms of performance. On my x86_64 Arch Linux installation I used the following packages:

  • clang 3.6.0-3
  • ldc 1:0.15.1-4
  • dtools 2.067.0-2

I used the following commands to compile each:

  • C++: clang++ scalar.cpp -o"scalar.cpp.exe" -std=c++11 -O3
  • D: rdmd --compiler=ldc2 -O3 -boundscheck=off <sourcefile>

Results

The results (screenshot of raw console output) of each version of the source are as follows:

  1. scalar.cpp (original C++):

    allocation: 2 ms
    
    random generation: 12 ms
    
    result: 29248300000
    
    time: 2582 ms
    

    C++ sets the standard at 2582 ms.

  2. scalar.d (modified OP source):

    allocation: 5 ms, 293 μs, and 5 hnsecs 
    
    random: 10 ms, 866 μs, and 4 hnsecs 
    
    result: 53237080000
    
    scalar products: 2 secs, 956 ms, 513 μs, and 7 hnsecs 
    

    This ran for ~2957 ms. Slower than the C++ implementation, but not too much.

  3. scalar2.d (index/length type change and uninitializedArray optimization):

    allocation: 2 ms, 464 μs, and 2 hnsecs
    
    random: 5 ms, 792 μs, and 6 hnsecs
    
    result: 59
    
    scalar products: 1 sec, 859 ms, 942 μs, and 9 hnsecs
    

    In other words, ~1860 ms. So far this is in the lead.

  4. scalar3.d (foreaches):

    allocation: 2 ms, 911 μs, and 3 hnsecs
    
    random: 7 ms, 567 μs, and 8 hnsecs
    
    result: 189
    
    scalar products: 2 secs, 182 ms, and 366 μs
    

    ~2182 ms is slower than scalar2.d, but faster than the C++ version.

Conclusion

With the correct optimizations, the D implementation actually went faster than its equivalent C++ implementation using the LLVM-based compilers available. The current gap between D and C++ for most applications seems only to be based on limitations of current implementations.

套路撩心 2024-10-26 16:56:13

dmd is the reference implementation of the language and thus most work is put into the frontend to fix bugs rather than optimizing the backend.

"in" is faster in your case cause you are using dynamic arrays which are reference types. With ref you introduce another level of indirection (which is normally used to alter the array itself and not only the contents).

Vectors are usually implemented with structs where const ref makes perfect sense. See smallptD vs. smallpt for a real-world example featuring loads of vector operations and randomness.
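
As a rough sketch of that distinction (types and names are illustrative, not taken from smallptD): a dynamic array is already a small (pointer, length) pair, so passing it with "in" is cheap, whereas a fixed-size struct vector is where const ref pays off:

// Slice passed by value ("in"): only the (pointer, length) pair is copied.
int dot(in int[] x, in int[] y) {
    int res = 0;
    foreach (i; 0 .. x.length)
        res += x[i] * y[i];
    return res;
}

// Fixed-size vector as a struct: const ref avoids copying the whole payload.
struct Vec3 { double x, y, z; }

double dot3(const ref Vec3 a, const ref Vec3 b) {
    return a.x * b.x + a.y * b.y + a.z * b.z;
}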

Note that 64-bit can also make a difference. I once missed that on x64, gcc compiles 64-bit code while dmd still defaults to 32-bit (this will change when the 64-bit codegen matures). There was a remarkable speedup with "dmd -m64 ...".

姐不稀罕 2024-10-26 16:56:13

Whether C++ or D is faster is likely to be highly dependent on what you're doing. I would think that when comparing well-written C++ to well-written D code, they would generally either be of similar speed, or C++ would be faster, but what the particular compiler manages to optimize could have a big effect completely aside from the language itself.

However, there are a few cases where D stands a good chance of beating C++ for speed. The main one which comes to mind would be string processing. Thanks to D's array slicing capabilities, strings (and arrays in general) can be processed much faster than you can readily do in C++. For D1, Tango's XML processor is extremely fast, thanks primarily to D's array slicing capabilities (and hopefully D2 will have a similarly fast XML parser once the one that's currently being worked on for Phobos has been completed). So, ultimately whether D or C++ is going to be faster is going to be very dependent on what you're doing.
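
A tiny sketch of what that slicing advantage looks like (my own example, not from Tango): a slice is just a view into the original array, so no copy or allocation happens:

import std.stdio : writeln;

void main() {
    string text = "fast string processing";
    string word = text[5 .. 11];          // "string": a view, not a copy
    assert(word.ptr == text.ptr + 5);     // shares the original storage
    writeln(word);
}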

Now, I am surprised that you're seeing such a difference in speed in this particular case, but it is the sort of thing that I would expect to improve as dmd improves. Using gdc might yield better results and would likely be a closer comparison of the language itself (rather than the backend) given that it's gcc-based. But it wouldn't surprise me at all if there are a number of things which could be done to speed up the code that dmd generates. I don't think that there's much question that gcc is more mature than dmd at this point. And code optimizations are one of the prime fruits of code maturity.

Ultimately, what matters is how well dmd performs for your particular application, but I do agree that it would definitely be nice to know how well C++ and D compare in general. In theory, they should be pretty much the same, but it really depends on the implementation. I think that a comprehensive set of benchmarks would be required to really test how well the two presently compare, however.

一梦浮鱼 2024-10-26 16:56:13

You can write C code in D, so as far as which is faster, it will depend on a lot of things:

  • What compiler you use
  • What features you use
  • How aggressively you optimize

Differences in the first aren't fair to drag in. The second might give C++ an advantage as it, if anything, has fewer heavy features. The third is the fun one: D code in some ways is easier to optimize because in general it is easier to understand. It also has the ability to do a large degree of generative programming, allowing verbose and repetitive but fast code to be written in shorter forms.
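
As a sketch of that last point (this uses present-day D features such as static foreach, which are newer than the compilers discussed here; the example is illustrative): a template can unroll a fixed-size dot product at compile time, so the repetitive-but-fast code never has to be written by hand:

// n is deduced from the static array arguments; the loop is expanded at compile time.
int dotUnrolled(size_t n)(const int[n] x, const int[n] y) {
    int res = 0;
    static foreach (i; 0 .. n)
        res += x[i] * y[i];
    return res;
}

unittest {
    int[4] a = [1, 2, 3, 4];
    int[4] b = [5, 6, 7, 8];
    assert(dotUnrolled(a, b) == 70);
}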

白色秋天 2024-10-26 16:56:13

Seems like a quality of implementation issue. For example, here's what I've been testing with:

import std.datetime, std.stdio, std.random;

version = ManualInline;

immutable N = 20000;
immutable Size = 10;

alias int value_type;
alias long result_type;
alias value_type[] vector_type;

result_type scalar_product(in vector_type x, in vector_type y)
in
{
    assert(x.length == y.length);
}
body
{
    result_type result = 0;

    foreach(i; 0 .. x.length)
        result += x[i] * y[i];

    return result;
}

void main()
{   
    auto startTime = Clock.currTime();

    // 1. allocate vectors
    vector_type[] vectors = new vector_type[N];
    foreach(ref vec; vectors)
        vec = new value_type[Size];

    auto time = Clock.currTime() - startTime;
    writefln("allocation: %s ", time);
    startTime = Clock.currTime();

    // 2. randomize vectors
    foreach(ref vec; vectors)
        foreach(ref e; vec)
            e = uniform(-1000, 1000);

    time = Clock.currTime() - startTime;
    writefln("random: %s ", time);
    startTime = Clock.currTime();

    // 3. compute all pairwise scalar products
    result_type avg = 0;

    foreach(vecA; vectors)
        foreach(vecB; vectors)
        {
            version(ManualInline)
            {
                result_type result = 0;

                foreach(i; 0 .. vecA.length)
                    result += vecA[i] * vecB[i];

                avg += result;
            }
            else
            {
                avg += scalar_product(vecA, vecB);
            }
        }

    avg = avg / (N * N);

    time = Clock.currTime() - startTime;
    writefln("scalar products: %s ", time);
    writefln("result: %s", avg);
}

With ManualInline defined I get 28 seconds, but without it I get 32. So the compiler isn't even inlining this simple function, which I think it clearly should be.

(My command line is dmd -O -noboundscheck -inline -release ....)
