加法（add）指令比处理器本身还快

发布于 2025-01-31 17:21:08 字数 4449 浏览 1 评论 0原文

我有此代码（添加了一些指令以实现基准公平性）：

.global count_forloop
.global count_addloop
.global count_mulloop
.global count_divloop

# count_forloop: baseline loop with no payload instruction.
# Counts RAX from 0 up to 10000000 and returns. The RSI/RCX save, setup and
# restore mirror the other count_* routines in this file; neither register is
# read inside the loop here.
count_forloop:
    push %rsi                   # save RSI, restored before ret
    push %rcx                   # save RCX, restored before ret
    mov $0xFFFFFFFF, %rsi       # RSI = 0xFFFFFFFF
    add %rsi, %rsi              # RSI = 0x1FFFFFFFE (unused in this routine)
    xor %eax, %eax              # RAX = 0; a 32-bit write zero-extends into RAX
_count_forloop1:
    inc %rax                    # loop counter
    cmp $10000000, %rax
    jne _count_forloop1         # repeat until RAX == 10000000
    pop %rcx
    pop %rsi
    ret

# count_addloop: the count_forloop loop plus one `add $3, %rcx` per iteration
# (10000000 iterations).
# NOTE(review): the add reads and writes only RCX while the loop counter lives
# in RAX, so the two dependency chains are independent of each other. A
# superscalar core can presumably execute both in the same cycle, in which case
# the time difference against count_forloop is smaller than the latency of one
# add -- verify on the target CPU.
count_addloop:
    push %rsi                   # save RSI, restored before ret
    push %rcx                   # save RCX, restored before ret
    mov $0xFFFFFFFF, %rsi       # RSI = 0xFFFFFFFF
    add %rsi, %rsi              # RSI = 0x1FFFFFFFE (unused in this routine)
    xor %eax, %eax              # RAX = 0: loop counter
    xor %ecx, %ecx              # RCX = 0: accumulator for the measured add
_count_addloop1:
    inc %rax
    add $3, %rcx # Benchmark this instruction
    cmp $10000000, %rax
    jne _count_addloop1         # repeat until RAX == 10000000
    pop %rcx
    pop %rsi
    ret

# count_mulloop: the count_forloop loop plus one `imul $3, %rcx` per iteration
# (10000000 iterations).
# RCX starts at 4 and is multiplied by 3 each time, so every multiply consumes
# the result of the previous one; the product wraps modulo 2^64.
count_mulloop:
    push %rsi                   # save RSI, restored before ret
    push %rcx                   # save RCX, restored before ret
    mov $0xFFFFFFFF, %rsi       # RSI = 0xFFFFFFFF
    add %rsi, %rsi              # RSI = 0x1FFFFFFFE (unused in this routine)
    xor %eax, %eax              # RAX = 0: loop counter
    xor %ecx, %ecx
    add $4, %ecx                # RCX = 4: initial multiplicand
_count_mulloop1:
    inc %rax
    imul $3, %rcx # Benchmark this instruction
    cmp $10000000, %rax
    jne _count_mulloop1         # repeat until RAX == 10000000
    pop %rcx
    pop %rsi
    ret

# count_divloop: loop that executes one 64-bit unsigned `div` per iteration.
#
# `div %rsi` divides the 128-bit value RDX:RAX by RSI, leaving the quotient in
# RAX and the remainder in RDX, and raises a divide error (#DE) when the
# quotient does not fit in 64 bits. RDX and RAX are zeroed before the loop so
# the first division never operates on stale values left by the caller.
# With a zero dividend every division yields quotient 0 and remainder 0, so
# the operands are identical on every iteration.
#
# NOTE(review): RCX is the loop counter here and starts at 1, so the loop body
# runs 9,999,999 times, one fewer than the other count_* loops.
count_divloop:
    push %rsi                   # save RSI, restored before ret
    push %rcx                   # save RCX, restored before ret
    mov $0xFFFFFFFF, %rsi       # RSI = 0xFFFFFFFF
    add %rsi, %rsi              # RSI = 0x1FFFFFFFE: the divisor
    xor %eax, %eax              # RAX = 0: low half of the dividend
    xor %edx, %edx              # RDX = 0: high half of the dividend
    xor %ecx, %ecx
    add $1, %rcx                # RCX = 1: loop counter start
_count_divloop1:
    inc %rcx
    div %rsi # Benchmark this instruction
    cmp $10000000, %rcx
    jne _count_divloop1         # repeat until RCX == 10000000
    pop %rcx
    pop %rsi
    ret

以及

#include <time.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define N 1000

void count_forloop(void);
void count_addloop(void);
void count_mulloop(void);
void count_divloop(void);

uint64_t ns(void);

/*
 * Benchmark driver.
 *
 * Times N back-to-back calls of each count_* routine with ns(), prints the
 * mean elapsed time per call, then subtracts the empty-loop baseline and
 * divides by the loop bound to estimate the cost of the one extra
 * instruction per iteration.
 *
 * The baseline-subtracted values are signed: when a loop measures no slower
 * than the baseline (timing noise, or the extra instruction overlapping with
 * the loop overhead), an unsigned subtraction would wrap around to a huge
 * bogus figure instead of a small negative one.
 *
 * Returns 0.
 */
int main(int argc, char** argv) {
    /* Command-line arguments are not used. */
    (void)argc;
    (void)argv;

    /* Loop bound the printed messages attribute to each count_* call; used to
     * scale a per-call time down to a per-iteration time. */
    const double iterations_per_call = 10000000.0;

    uint64_t start_time_for = ns();
    for (int i = 0; i < N; i++) {
        count_forloop();
    }
    uint64_t end_time_for = ns();
    uint64_t diff_for = (end_time_for - start_time_for) / N;
    printf("10.000.000 iterations of empty forloop: %" PRIu64 "ns\n", diff_for);

    uint64_t start_time_add = ns();
    for (int i = 0; i < N; i++) {
        count_addloop();
    }
    uint64_t end_time_add = ns();
    uint64_t diff_add = (end_time_add - start_time_add) / N;
    int64_t net_add = (int64_t)diff_add - (int64_t)diff_for;
    printf("10.000.000 iterations of addloop: %" PRIu64 "ns\n", diff_add);
    printf("10.000.000 iterations of addloop: %" PRId64 "ns (Excluding for-loop)\n", net_add);

    uint64_t start_time_mul = ns();
    for (int i = 0; i < N; i++) {
        count_mulloop();
    }
    uint64_t end_time_mul = ns();
    uint64_t diff_mul = (end_time_mul - start_time_mul) / N;
    int64_t net_mul = (int64_t)diff_mul - (int64_t)diff_for;
    printf("10.000.000 iterations of mulloop: %" PRIu64 "ns\n", diff_mul);
    printf("10.000.000 iterations of mulloop: %" PRId64 "ns (Excluding for-loop)\n", net_mul);

    uint64_t start_time_div = ns();
    for (int i = 0; i < N; i++) {
        count_divloop();
    }
    uint64_t end_time_div = ns();
    uint64_t diff_div = (end_time_div - start_time_div) / N;
    int64_t net_div = (int64_t)diff_div - (int64_t)diff_for;
    printf("10.000.000 iterations of divloop: %" PRIu64 "ns\n", diff_div);
    printf("10.000.000 iterations of divloop: %" PRId64 "ns (Excluding for-loop)\n", net_div);

    /* Estimated time per executed instruction, in nanoseconds. */
    double real_add = (double)net_add / iterations_per_call;
    double real_mul = (double)net_mul / iterations_per_call;
    double real_div = (double)net_div / iterations_per_call;
    printf ("Add: %lfns\n", real_add);
    printf ("Mul: %lfns\n", real_mul);
    printf ("Div: %lfns\n", real_div);
    printf ("Mul/Add = %lf\n", real_mul / real_add);
    printf ("Div/Add = %lf\n", real_div / real_add);
    return 0;
}

/*
 * Returns the current reading of the monotonic clock in nanoseconds.
 *
 * The value is only meaningful as the difference between two calls.
 * CLOCK_MONOTONIC is used instead of CLOCK_REALTIME so that adjustments to
 * the wall clock made while a measurement is running cannot distort the
 * interval or make it negative.
 *
 * Returns 0 if the clock cannot be read.
 */
uint64_t ns(void) {
    struct timespec t;
    if (clock_gettime(CLOCK_MONOTONIC, &t) != 0) {
        return 0;
    }
    return (uint64_t)t.tv_sec * UINT64_C(1000000000) + (uint64_t)t.tv_nsec;
}

我想对加法、乘法和除法等运算所需的时间进行基准测试和比较。

10.000.000 iterations of empty forloop: 2415544ns
10.000.000 iterations of addloop: 3074961ns
10.000.000 iterations of addloop: 659417ns (Excluding for-loop)
10.000.000 iterations of mulloop: 7177428ns
10.000.000 iterations of mulloop: 4761884ns (Excluding for-loop)
10.000.000 iterations of divloop: 43092662ns
10.000.000 iterations of divloop: 40677118ns (Excluding for-loop)
Add: 0.065942ns # Here, this is weird
Mul: 0.476188ns
Div: 4.067712ns
Mul/Add = 7.221355
Div/Add = 61.686487

首先，我只对 count_forloop 的调用进行基准测试，以得到除待测指令之外所有指令的耗时；然后测量其他各个函数的耗时，减去空 for 循环所需的时间，再除以迭代次数，得到该指令执行一次所需的时间。

我的处理器最高可达 4.2GHz，运行此程序时，其中一个核心几乎达到了这一频率。

假设它确实能达到 4.2GHz，那么每个时钟周期耗时 2.381e-10 秒，即大约 0.24 纳秒。

但是，如您所见，add 指令仅需 0.065942ns，大约只有一个周期的三分之一。

我多次运行该程序，但总是得到相同的结果：add 比处理器本身还快。

我在计算中找不到任何错误。

原文

I have this code (Some instructions are added for benchmark fairness):

.global count_forloop
.global count_addloop
.global count_mulloop
.global count_divloop

# count_forloop: baseline loop with no payload instruction.
# Counts RAX from 0 up to 10000000 and returns. The RSI/RCX save, setup and
# restore mirror the other count_* routines in this file; neither register is
# read inside the loop here.
count_forloop:
    push %rsi                   # save RSI, restored before ret
    push %rcx                   # save RCX, restored before ret
    mov $0xFFFFFFFF, %rsi       # RSI = 0xFFFFFFFF
    add %rsi, %rsi              # RSI = 0x1FFFFFFFE (unused in this routine)
    xor %eax, %eax              # RAX = 0; a 32-bit write zero-extends into RAX
_count_forloop1:
    inc %rax                    # loop counter
    cmp $10000000, %rax
    jne _count_forloop1         # repeat until RAX == 10000000
    pop %rcx
    pop %rsi
    ret

# count_addloop: the count_forloop loop plus one `add $3, %rcx` per iteration
# (10000000 iterations).
# NOTE(review): the add reads and writes only RCX while the loop counter lives
# in RAX, so the two dependency chains are independent of each other. A
# superscalar core can presumably execute both in the same cycle, in which case
# the time difference against count_forloop is smaller than the latency of one
# add -- verify on the target CPU.
count_addloop:
    push %rsi                   # save RSI, restored before ret
    push %rcx                   # save RCX, restored before ret
    mov $0xFFFFFFFF, %rsi       # RSI = 0xFFFFFFFF
    add %rsi, %rsi              # RSI = 0x1FFFFFFFE (unused in this routine)
    xor %eax, %eax              # RAX = 0: loop counter
    xor %ecx, %ecx              # RCX = 0: accumulator for the measured add
_count_addloop1:
    inc %rax
    add $3, %rcx # Benchmark this instruction
    cmp $10000000, %rax
    jne _count_addloop1         # repeat until RAX == 10000000
    pop %rcx
    pop %rsi
    ret

# count_mulloop: the count_forloop loop plus one `imul $3, %rcx` per iteration
# (10000000 iterations).
# RCX starts at 4 and is multiplied by 3 each time, so every multiply consumes
# the result of the previous one; the product wraps modulo 2^64.
count_mulloop:
    push %rsi                   # save RSI, restored before ret
    push %rcx                   # save RCX, restored before ret
    mov $0xFFFFFFFF, %rsi       # RSI = 0xFFFFFFFF
    add %rsi, %rsi              # RSI = 0x1FFFFFFFE (unused in this routine)
    xor %eax, %eax              # RAX = 0: loop counter
    xor %ecx, %ecx
    add $4, %ecx                # RCX = 4: initial multiplicand
_count_mulloop1:
    inc %rax
    imul $3, %rcx # Benchmark this instruction
    cmp $10000000, %rax
    jne _count_mulloop1         # repeat until RAX == 10000000
    pop %rcx
    pop %rsi
    ret

# count_divloop: loop that executes one 64-bit unsigned `div` per iteration.
#
# `div %rsi` divides the 128-bit value RDX:RAX by RSI, leaving the quotient in
# RAX and the remainder in RDX, and raises a divide error (#DE) when the
# quotient does not fit in 64 bits. RDX and RAX are zeroed before the loop so
# the first division never operates on stale values left by the caller.
# With a zero dividend every division yields quotient 0 and remainder 0, so
# the operands are identical on every iteration.
#
# NOTE(review): RCX is the loop counter here and starts at 1, so the loop body
# runs 9,999,999 times, one fewer than the other count_* loops.
count_divloop:
    push %rsi                   # save RSI, restored before ret
    push %rcx                   # save RCX, restored before ret
    mov $0xFFFFFFFF, %rsi       # RSI = 0xFFFFFFFF
    add %rsi, %rsi              # RSI = 0x1FFFFFFFE: the divisor
    xor %eax, %eax              # RAX = 0: low half of the dividend
    xor %edx, %edx              # RDX = 0: high half of the dividend
    xor %ecx, %ecx
    add $1, %rcx                # RCX = 1: loop counter start
_count_divloop1:
    inc %rcx
    div %rsi # Benchmark this instruction
    cmp $10000000, %rcx
    jne _count_divloop1         # repeat until RCX == 10000000
    pop %rcx
    pop %rsi
    ret

and

#include <time.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define N 1000

void count_forloop(void);
void count_addloop(void);
void count_mulloop(void);
void count_divloop(void);

uint64_t ns(void);

/*
 * Benchmark driver.
 *
 * Times N back-to-back calls of each count_* routine with ns(), prints the
 * mean elapsed time per call, then subtracts the empty-loop baseline and
 * divides by the loop bound to estimate the cost of the one extra
 * instruction per iteration.
 *
 * The baseline-subtracted values are signed: when a loop measures no slower
 * than the baseline (timing noise, or the extra instruction overlapping with
 * the loop overhead), an unsigned subtraction would wrap around to a huge
 * bogus figure instead of a small negative one.
 *
 * Returns 0.
 */
int main(int argc, char** argv) {
    /* Command-line arguments are not used. */
    (void)argc;
    (void)argv;

    /* Loop bound the printed messages attribute to each count_* call; used to
     * scale a per-call time down to a per-iteration time. */
    const double iterations_per_call = 10000000.0;

    uint64_t start_time_for = ns();
    for (int i = 0; i < N; i++) {
        count_forloop();
    }
    uint64_t end_time_for = ns();
    uint64_t diff_for = (end_time_for - start_time_for) / N;
    printf("10.000.000 iterations of empty forloop: %" PRIu64 "ns\n", diff_for);

    uint64_t start_time_add = ns();
    for (int i = 0; i < N; i++) {
        count_addloop();
    }
    uint64_t end_time_add = ns();
    uint64_t diff_add = (end_time_add - start_time_add) / N;
    int64_t net_add = (int64_t)diff_add - (int64_t)diff_for;
    printf("10.000.000 iterations of addloop: %" PRIu64 "ns\n", diff_add);
    printf("10.000.000 iterations of addloop: %" PRId64 "ns (Excluding for-loop)\n", net_add);

    uint64_t start_time_mul = ns();
    for (int i = 0; i < N; i++) {
        count_mulloop();
    }
    uint64_t end_time_mul = ns();
    uint64_t diff_mul = (end_time_mul - start_time_mul) / N;
    int64_t net_mul = (int64_t)diff_mul - (int64_t)diff_for;
    printf("10.000.000 iterations of mulloop: %" PRIu64 "ns\n", diff_mul);
    printf("10.000.000 iterations of mulloop: %" PRId64 "ns (Excluding for-loop)\n", net_mul);

    uint64_t start_time_div = ns();
    for (int i = 0; i < N; i++) {
        count_divloop();
    }
    uint64_t end_time_div = ns();
    uint64_t diff_div = (end_time_div - start_time_div) / N;
    int64_t net_div = (int64_t)diff_div - (int64_t)diff_for;
    printf("10.000.000 iterations of divloop: %" PRIu64 "ns\n", diff_div);
    printf("10.000.000 iterations of divloop: %" PRId64 "ns (Excluding for-loop)\n", net_div);

    /* Estimated time per executed instruction, in nanoseconds. */
    double real_add = (double)net_add / iterations_per_call;
    double real_mul = (double)net_mul / iterations_per_call;
    double real_div = (double)net_div / iterations_per_call;
    printf ("Add: %lfns\n", real_add);
    printf ("Mul: %lfns\n", real_mul);
    printf ("Div: %lfns\n", real_div);
    printf ("Mul/Add = %lf\n", real_mul / real_add);
    printf ("Div/Add = %lf\n", real_div / real_add);
    return 0;
}

/*
 * Returns the current reading of the monotonic clock in nanoseconds.
 *
 * The value is only meaningful as the difference between two calls.
 * CLOCK_MONOTONIC is used instead of CLOCK_REALTIME so that adjustments to
 * the wall clock made while a measurement is running cannot distort the
 * interval or make it negative.
 *
 * Returns 0 if the clock cannot be read.
 */
uint64_t ns(void) {
    struct timespec t;
    if (clock_gettime(CLOCK_MONOTONIC, &t) != 0) {
        return 0;
    }
    return (uint64_t)t.tv_sec * UINT64_C(1000000000) + (uint64_t)t.tv_nsec;
}

I wanted to benchmark and compare how long operations such as adding, multiplying and dividing take.

10.000.000 iterations of empty forloop: 2415544ns
10.000.000 iterations of addloop: 3074961ns
10.000.000 iterations of addloop: 659417ns (Excluding for-loop)
10.000.000 iterations of mulloop: 7177428ns
10.000.000 iterations of mulloop: 4761884ns (Excluding for-loop)
10.000.000 iterations of divloop: 43092662ns
10.000.000 iterations of divloop: 40677118ns (Excluding for-loop)
Add: 0.065942ns # Here, this is weird
Mul: 0.476188ns
Div: 4.067712ns
Mul/Add = 7.221355
Div/Add = 61.686487

First I just benchmark the calls to count_forloop in order to get the performance of all instructions except the one I want to measure, then I measure the time for the different functions, subtract the time needed for an empty for-loop and then divide until I have the time for one execution of the instruction.

My processor is able to reach 4.2GHz, and while running this program that frequency is nearly reached on one core.

Let's assume it is able to reach those 4.2GHz; this means that each cycle takes 2.381e-10 seconds, or around 0.24 nanoseconds.

But as you can see, the add instruction only takes 0.065942ns, so only around one third of one cycle.

I tried running the program multiple times, but I always get the same result: add is faster than the processor itself.

I can't find any error in my calculations.

分享到QQ

分享到微博