gcc/g++ 在不同优化级别下访问局部变量与全局变量的速度

发布于 2024-12-01 20:04:27 字数 6711 浏览 3 评论 0原文

我发现在循环中访问局部或全局变量时，gcc 的不同优化级别会给出截然不同的结果。这让我感到惊讶的原因是：如果对一种变量的访问比对另一种变量的访问更容易优化，我认为 gcc 的优化应该会利用这一事实。这里有两个例子（用 C++ 编写，但对应的 C 版本给出了几乎相同的计时）：

    global = 0;
    for (int i = 0; i < SIZE; i++)
        global++;

它使用全局变量 long global，与

    long tmp = 0;
    for (int i = 0; i < SIZE; i++)
        tmp++;
    global = tmp;

在优化级别 -O0 的计时基本相同（正如我所期望的），在 -O1 时速度稍快，但仍然相同，但从 -O2 开始，使用全局变量的版本要快得多（大约是 7 倍）。

另一方面，在以下代码片段中，start 指向大小为 SIZE 的字节块：

    global = 0;
    for (const char* p = start; p < start + SIZE; p++)
        global += *p;

与以下版本相比：

    long tmp = 0;
    for (const char* p = start; p < start + SIZE; p++)
        tmp += *p;
    global = tmp;

在 -O0 下，时间很接近，尽管使用局部变量的版本稍快一些，这似乎并不令人意外，因为它可能会存储在寄存器中，而 global 不会。然后，在 -O1 及更高级别中，使用局部变量的版本要快得多（超过 50% 或 1.5 倍）。正如之前所说，这让我感到惊讶，因为我认为 gcc 应该能像我一样轻松地（在生成的优化代码中）先使用局部变量，稍后再将其赋值给全局变量。

所以我的问题是：全局变量和局部变量之间究竟有什么区别，使得 gcc 只能对其中一种执行某些优化，而不能对另一种执行？

一些可能相关或不相关的细节：我在运行 RHEL4、具有两个单核处理器和 4GB RAM 的计算机上使用了 gcc/g++ 版本 3.4.5。我使用的 SIZE（预处理器宏）值是 1000000000。第二个示例中的字节块是动态分配的。

以下是优化级别 0 到 4 的一些计时输出（与上面的顺序相同）：

$ ./st0
Result using global variable: 1000000000 in 2.213 seconds.
Result using local variable:  1000000000 in 2.210 seconds.
Result using global variable: 0 in 3.924 seconds.
Result using local variable:  0 in 3.710 seconds.
$ ./st1
Result using global variable: 1000000000 in 0.947 seconds.
Result using local variable:  1000000000 in 0.947 seconds.
Result using global variable: 0 in 2.135 seconds.
Result using local variable:  0 in 1.212 seconds.
$ ./st2
Result using global variable: 1000000000 in 0.022 seconds.
Result using local variable:  1000000000 in 0.552 seconds.
Result using global variable: 0 in 2.135 seconds.
Result using local variable:  0 in 1.227 seconds.
$ ./st3
Result using global variable: 1000000000 in 0.065 seconds.
Result using local variable:  1000000000 in 0.461 seconds.
Result using global variable: 0 in 2.453 seconds.
Result using local variable:  0 in 1.646 seconds.
$ ./st4
Result using global variable: 1000000000 in 0.063 seconds.
Result using local variable:  1000000000 in 0.468 seconds.
Result using global variable: 0 in 2.467 seconds.
Result using local variable:  0 in 1.663 seconds.

编辑：这是使用 -O2 开关为前两个片段生成的汇编代码，这是差异最大的情况。据我了解，它看起来像是编译器中的一个错误：0x3b9aca00 是十六进制的 SIZE，0x80496dc 应该是 global 的地址。我用较新的编译器检查过，这种情况不再发生了。然而，第二对片段的差异仍然类似。

    /* Benchmark variant 1 (global accumulator): counts up to SIZE by
     * incrementing `global` directly inside the loop.  `global` and SIZE are
     * not declared in this snippet; they come from the surrounding program. */
    void global1()
    {
        int i;
        /* Reset the accumulator so the final value equals the iteration count. */
        global = 0;
        for (i = 0; i < SIZE; i++)
            global++;
    }

    /* Benchmark variant 1 (local accumulator): counts up to SIZE in the local
     * `tmp` and stores the total into `global` once, after the loop.  `global`
     * and SIZE are not declared in this snippet; they come from the
     * surrounding program. */
    void local1()
    {
        int i;
        long tmp = 0;
        for (i = 0; i < SIZE; i++)
            tmp++;
        /* Single write to the global, outside the loop. */
        global = tmp;
    }

    080483d0 <global1>:
     80483d0:   55                      push   %ebp
     80483d1:   89 e5                   mov    %esp,%ebp
     80483d3:   c7 05 dc 96 04 08 00    movl   $0x0,0x80496dc
     80483da:   00 00 00 
     80483dd:   b8 ff c9 9a 3b          mov    $0x3b9ac9ff,%eax
     80483e2:   89 f6                   mov    %esi,%esi
     80483e4:   83 e8 19                sub    $0x19,%eax
     80483e7:   79 fb                   jns    80483e4 <global1+0x14>
     80483e9:   c7 05 dc 96 04 08 00    movl   $0x3b9aca00,0x80496dc
     80483f0:   ca 9a 3b 
     80483f3:   c9                      leave  
     80483f4:   c3                      ret    
     80483f5:   8d 76 00                lea    0x0(%esi),%esi

    080483f8 <local1>:
     80483f8:   55                      push   %ebp
     80483f9:   89 e5                   mov    %esp,%ebp
     80483fb:   b8 ff c9 9a 3b          mov    $0x3b9ac9ff,%eax
     8048400:   48                      dec    %eax
     8048401:   79 fd                   jns    8048400 <local1+0x8>
     8048403:   c7 05 dc 96 04 08 00    movl   $0x3b9aca00,0x80496dc
     804840a:   ca 9a 3b 
     804840d:   c9                      leave  
     804840e:   c3                      ret    
     804840f:   90                      nop

最后，这是其余片段的代码，现在由 gcc 4.3.3 使用 -O3 生成（尽管旧版本似乎生成类似的代码）。看起来 global2(..) 确实被编译为一个在循环的每次迭代中都访问全局内存位置的函数，而 local2(..) 使用寄存器。我仍然不清楚为什么 gcc 不使用寄存器来优化全局版本。这只是一个尚未实现的功能，还是这样做真的会导致可执行文件出现不可接受的行为？

    /* Benchmark variant 2 (global accumulator): sums the SIZE elements starting
     * at `start`, adding each one to `global` directly inside the loop.
     * `global` and SIZE are not declared in this snippet; they come from the
     * surrounding program.
     *
     * start: pointer to the first of SIZE readable `char` elements. */
    void global2(const char* start)
    {
        const char* p;
        /* Reset the accumulator before summing. */
        global = 0;
        for (p = start; p < start + SIZE; p++)
            global += *p;
    }

    /* Benchmark variant 2 (local accumulator): sums the SIZE elements starting
     * at `start` into the local `tmp`, then stores the total into `global` once
     * after the loop.  `global` and SIZE are not declared in this snippet; they
     * come from the surrounding program.
     *
     * start: pointer to the first of SIZE readable `char` elements. */
    void local2(const char* start)
    {
        const char* p;
        long tmp = 0;
        for (p = start; p < start + SIZE; p++)
            tmp += *p;
        /* Single write to the global, outside the loop. */
        global = tmp;
    }

    08048470 <global2>:
     8048470:   55                      push   %ebp
     8048471:   31 d2                   xor    %edx,%edx
     8048473:   89 e5                   mov    %esp,%ebp
     8048475:   8b 4d 08                mov    0x8(%ebp),%ecx
     8048478:   c7 05 24 a0 04 08 00    movl   $0x0,0x804a024
     804847f:   00 00 00 
     8048482:   8d b6 00 00 00 00       lea    0x0(%esi),%esi
     8048488:   0f be 04 11             movsbl (%ecx,%edx,1),%eax
     804848c:   83 c2 01                add    $0x1,%edx
     804848f:   01 05 24 a0 04 08       add    %eax,0x804a024
     8048495:   81 fa 00 ca 9a 3b       cmp    $0x3b9aca00,%edx
     804849b:   75 eb                   jne    8048488 <global2+0x18>
     804849d:   5d                      pop    %ebp
     804849e:   c3                      ret    
     804849f:   90                      nop    

    080484a0 <local2>:
     80484a0:   55                      push   %ebp
     80484a1:   31 c9                   xor    %ecx,%ecx
     80484a3:   89 e5                   mov    %esp,%ebp
     80484a5:   31 d2                   xor    %edx,%edx
     80484a7:   53                      push   %ebx
     80484a8:   8b 5d 08                mov    0x8(%ebp),%ebx
     80484ab:   90                      nop    
     80484ac:   8d 74 26 00             lea    0x0(%esi,%eiz,1),%esi
     80484b0:   0f be 04 13             movsbl (%ebx,%edx,1),%eax
     80484b4:   83 c2 01                add    $0x1,%edx
     80484b7:   01 c1                   add    %eax,%ecx
     80484b9:   81 fa 00 ca 9a 3b       cmp    $0x3b9aca00,%edx
     80484bf:   75 ef                   jne    80484b0 <local2+0x10>
     80484c1:   5b                      pop    %ebx
     80484c2:   89 0d 24 a0 04 08       mov    %ecx,0x804a024
     80484c8:   5d                      pop    %ebp
     80484c9:   c3                      ret    
     80484ca:   8d b6 00 00 00 00       lea    0x0(%esi),%esi

谢谢。

原文

I found that different compiler optimization levels in gcc give quite different results when accessing a local or a global variable in a loop. The reason this surprised me is that if access to one type of variable is more optimizable than access to another, I would think gcc optimization would exploit that fact.
Here come two examples (in C++ but their C counterparts give practically the same timings):

    global = 0;
    for (int i = 0; i < SIZE; i++)
        global++;

which uses a global variable long global, versus

    long tmp = 0;
    for (int i = 0; i < SIZE; i++)
        tmp++;
    global = tmp;

At optimization level -O0 the timing is essentially equal (as I would expect), at -O1 it is somewhat faster but still equal, but from -O2 the version using the global variable is much faster (a factor 7 or so).

On the other hand, in the following code fragments where start points to a block of bytes of size SIZE:

    global = 0;
    for (const char* p = start; p < start + SIZE; p++)
        global += *p;

versus

    long tmp = 0;
    for (const char* p = start; p < start + SIZE; p++)
        tmp += *p;
    global = tmp;

Here at -O0 the timings are close, though the version using the local variable is slightly faster, which doesn't seem too surprising, as maybe it will be stored in a register, whereas global wouldn't. Then at -O1 and higher the version using a local variable is considerably faster (more than 50% or 1.5 times). As remarked before, this surprises me, because I would think that for gcc it would be as easy as for me to use a local variable (in the generated optimized code) to assign to the global one later on.

So my question is: what is it about global and local variables that allows gcc to perform certain optimizations on one kind but not on the other?

Some details that may or may not be relevant: I used gcc/g++ version 3.4.5 on a machine running RHEL4 with two single core processors and 4GB RAM. The value I used for SIZE, which is a preprocessor macro, was 1000000000. The block of bytes in the second example was dynamically allocated.

Here are some timing outputs for optimization levels 0 to 4 (in the same order as above):

$ ./st0
Result using global variable: 1000000000 in 2.213 seconds.
Result using local variable:  1000000000 in 2.210 seconds.
Result using global variable: 0 in 3.924 seconds.
Result using local variable:  0 in 3.710 seconds.
$ ./st1
Result using global variable: 1000000000 in 0.947 seconds.
Result using local variable:  1000000000 in 0.947 seconds.
Result using global variable: 0 in 2.135 seconds.
Result using local variable:  0 in 1.212 seconds.
$ ./st2
Result using global variable: 1000000000 in 0.022 seconds.
Result using local variable:  1000000000 in 0.552 seconds.
Result using global variable: 0 in 2.135 seconds.
Result using local variable:  0 in 1.227 seconds.
$ ./st3
Result using global variable: 1000000000 in 0.065 seconds.
Result using local variable:  1000000000 in 0.461 seconds.
Result using global variable: 0 in 2.453 seconds.
Result using local variable:  0 in 1.646 seconds.
$ ./st4
Result using global variable: 1000000000 in 0.063 seconds.
Result using local variable:  1000000000 in 0.468 seconds.
Result using global variable: 0 in 2.467 seconds.
Result using local variable:  0 in 1.663 seconds.

EDIT
This is the generated assembly for the first two snippets with switch -O2, the case where the difference is largest. As far as I understand, it looks like a bug in the compiler: 0x3b9aca00 is SIZE in hexadecimal, and 0x80496dc must be the address of global.
I checked with a newer compiler, and this doesn't happen anymore. The difference in the second pair of snippets is similar however.

    /* Benchmark variant 1 (global accumulator): counts up to SIZE by
     * incrementing `global` directly inside the loop.  `global` and SIZE are
     * not declared in this snippet; they come from the surrounding program. */
    void global1()
    {
        int i;
        /* Reset the accumulator so the final value equals the iteration count. */
        global = 0;
        for (i = 0; i < SIZE; i++)
            global++;
    }

    /* Benchmark variant 1 (local accumulator): counts up to SIZE in the local
     * `tmp` and stores the total into `global` once, after the loop.  `global`
     * and SIZE are not declared in this snippet; they come from the
     * surrounding program. */
    void local1()
    {
        int i;
        long tmp = 0;
        for (i = 0; i < SIZE; i++)
            tmp++;
        /* Single write to the global, outside the loop. */
        global = tmp;
    }

    080483d0 <global1>:
     80483d0:   55                      push   %ebp
     80483d1:   89 e5                   mov    %esp,%ebp
     80483d3:   c7 05 dc 96 04 08 00    movl   $0x0,0x80496dc
     80483da:   00 00 00 
     80483dd:   b8 ff c9 9a 3b          mov    $0x3b9ac9ff,%eax
     80483e2:   89 f6                   mov    %esi,%esi
     80483e4:   83 e8 19                sub    $0x19,%eax
     80483e7:   79 fb                   jns    80483e4 <global1+0x14>
     80483e9:   c7 05 dc 96 04 08 00    movl   $0x3b9aca00,0x80496dc
     80483f0:   ca 9a 3b 
     80483f3:   c9                      leave  
     80483f4:   c3                      ret    
     80483f5:   8d 76 00                lea    0x0(%esi),%esi

    080483f8 <local1>:
     80483f8:   55                      push   %ebp
     80483f9:   89 e5                   mov    %esp,%ebp
     80483fb:   b8 ff c9 9a 3b          mov    $0x3b9ac9ff,%eax
     8048400:   48                      dec    %eax
     8048401:   79 fd                   jns    8048400 <local1+0x8>
     8048403:   c7 05 dc 96 04 08 00    movl   $0x3b9aca00,0x80496dc
     804840a:   ca 9a 3b 
     804840d:   c9                      leave  
     804840e:   c3                      ret    
     804840f:   90                      nop

Finally here is the code of the remaining snippets, now generated by gcc 4.3.3 using -O3 (though the old version seems to generate similar code). It looks like indeed global2(..) compiles to a function accessing the global memory location in every iteration of the loop, where local2(..) uses a register. It is still not clear to me why gcc wouldn't optimize the global version using a register anyway. Is this just a lacking feature, or would it really lead to unacceptable behaviour of the executable?

    /* Benchmark variant 2 (global accumulator): sums the SIZE elements starting
     * at `start`, adding each one to `global` directly inside the loop.
     * `global` and SIZE are not declared in this snippet; they come from the
     * surrounding program.
     *
     * start: pointer to the first of SIZE readable `char` elements. */
    void global2(const char* start)
    {
        const char* p;
        /* Reset the accumulator before summing. */
        global = 0;
        for (p = start; p < start + SIZE; p++)
            global += *p;
    }

    /* Benchmark variant 2 (local accumulator): sums the SIZE elements starting
     * at `start` into the local `tmp`, then stores the total into `global` once
     * after the loop.  `global` and SIZE are not declared in this snippet; they
     * come from the surrounding program.
     *
     * start: pointer to the first of SIZE readable `char` elements. */
    void local2(const char* start)
    {
        const char* p;
        long tmp = 0;
        for (p = start; p < start + SIZE; p++)
            tmp += *p;
        /* Single write to the global, outside the loop. */
        global = tmp;
    }

    08048470 <global2>:
     8048470:   55                      push   %ebp
     8048471:   31 d2                   xor    %edx,%edx
     8048473:   89 e5                   mov    %esp,%ebp
     8048475:   8b 4d 08                mov    0x8(%ebp),%ecx
     8048478:   c7 05 24 a0 04 08 00    movl   $0x0,0x804a024
     804847f:   00 00 00 
     8048482:   8d b6 00 00 00 00       lea    0x0(%esi),%esi
     8048488:   0f be 04 11             movsbl (%ecx,%edx,1),%eax
     804848c:   83 c2 01                add    $0x1,%edx
     804848f:   01 05 24 a0 04 08       add    %eax,0x804a024
     8048495:   81 fa 00 ca 9a 3b       cmp    $0x3b9aca00,%edx
     804849b:   75 eb                   jne    8048488 <global2+0x18>
     804849d:   5d                      pop    %ebp
     804849e:   c3                      ret    
     804849f:   90                      nop    

    080484a0 <local2>:
     80484a0:   55                      push   %ebp
     80484a1:   31 c9                   xor    %ecx,%ecx
     80484a3:   89 e5                   mov    %esp,%ebp
     80484a5:   31 d2                   xor    %edx,%edx
     80484a7:   53                      push   %ebx
     80484a8:   8b 5d 08                mov    0x8(%ebp),%ebx
     80484ab:   90                      nop    
     80484ac:   8d 74 26 00             lea    0x0(%esi,%eiz,1),%esi
     80484b0:   0f be 04 13             movsbl (%ebx,%edx,1),%eax
     80484b4:   83 c2 01                add    $0x1,%edx
     80484b7:   01 c1                   add    %eax,%ecx
     80484b9:   81 fa 00 ca 9a 3b       cmp    $0x3b9aca00,%edx
     80484bf:   75 ef                   jne    80484b0 <local2+0x10>
     80484c1:   5b                      pop    %ebx
     80484c2:   89 0d 24 a0 04 08       mov    %ecx,0x804a024
     80484c8:   5d                      pop    %ebp
     80484c9:   c3                      ret    
     80484ca:   8d b6 00 00 00 00       lea    0x0(%esi),%esi

Thanks.

分享到QQ

分享到微博