加法(add)指令比处理器本身还快
我有此代码(添加了一些指令以实现基准公平性):
.global count_forloop
.global count_addloop
.global count_mulloop
.global count_divloop
# count_forloop: baseline loop that only counts %rax from 0 up to 10000000.
# Takes no arguments. %rsi and %rcx are saved on entry and restored on exit;
# %rax holds 10000000 when the routine returns.
count_forloop:
push %rsi
push %rcx
mov $0xFFFFFFFF, %rsi   # %rsi = 0xFFFFFFFF ...
add %rsi, %rsi          # ... doubled to 0x1FFFFFFFE; not read again in this routine
xor %eax, %eax          # 32-bit write zero-extends, so %rax = 0
_count_forloop1:
inc %rax                # loop counter
cmp $10000000, %rax
jne _count_forloop1     # repeat until %rax == 10000000
pop %rcx
pop %rsi
ret
# count_addloop: same counting loop as count_forloop, with one extra
# `add $3, %rcx` per iteration.
# Takes no arguments. %rsi and %rcx are saved on entry and restored on exit,
# so the accumulated sum in %rcx is discarded.
# NOTE(review): the add only reads and writes %rcx, while inc/cmp/jne only use
# %rax, so there is no data dependency between them. An out-of-order
# superscalar core can presumably execute the add alongside the counter
# update, in which case the time difference to count_forloop is not the
# latency of one add.
count_addloop:
push %rsi
push %rcx
mov $0xFFFFFFFF, %rsi   # %rsi = 0xFFFFFFFF ...
add %rsi, %rsi          # ... doubled to 0x1FFFFFFFE; not read again in this routine
xor %eax, %eax          # loop counter %rax = 0
xor %ecx, %ecx          # accumulator %rcx = 0
_count_addloop1:
inc %rax
add $3, %rcx # Benchmark this instruction
cmp $10000000, %rax
jne _count_addloop1     # repeat until %rax == 10000000
pop %rcx
pop %rsi
ret
# count_mulloop: same counting loop as count_forloop, with one extra
# `imul $3, %rcx` per iteration.
# Takes no arguments. %rsi and %rcx are saved on entry and restored on exit,
# so the product in %rcx is discarded.
# Each imul reads the %rcx written by the previous iteration's imul, so the
# multiplications form a loop-carried dependency chain.
count_mulloop:
push %rsi
push %rcx
mov $0xFFFFFFFF, %rsi   # %rsi = 0xFFFFFFFF ...
add %rsi, %rsi          # ... doubled to 0x1FFFFFFFE; not read again in this routine
xor %eax, %eax          # loop counter %rax = 0
xor %ecx, %ecx
add $4, %ecx            # initial multiplicand: %rcx = 4
_count_mulloop1:
inc %rax
imul $3, %rcx # Benchmark this instruction
cmp $10000000, %rax
jne _count_mulloop1     # repeat until %rax == 10000000
pop %rcx
pop %rsi
ret
# count_divloop: loop of 10000000 iterations, each executing one unsigned
# `div %rsi` (RDX:RAX / RSI -> quotient in %rax, remainder in %rdx).
# Takes no arguments. %rax and %rdx are clobbered; %rsi and %rcx are saved on
# entry and restored on exit.
count_divloop:
push %rsi
push %rcx
mov $0xFFFFFFFF, %rsi   # %rsi = 0xFFFFFFFF ...
add %rsi, %rsi          # ... doubled: divisor = 0x1FFFFFFFE
# Give the dividend RDX:RAX a defined starting value. With %rdx < %rsi the
# quotient always fits in 64 bits, so the first div cannot raise #DE; every
# later div starts from the previous remainder, which is also < %rsi.
mov $1, %eax
xor %edx, %edx
xor %ecx, %ecx          # counter starts at 0 so the body runs exactly 10000000 times
_count_divloop1:
inc %rcx
div %rsi # Benchmark this instruction
cmp $10000000, %rcx
jne _count_divloop1     # repeat until %rcx == 10000000
pop %rcx
pop %rsi
ret
以及
#include <time.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#define N 1000
void count_forloop(void);
void count_addloop(void);
void count_mulloop(void);
void count_divloop(void);
uint64_t ns(void);
/*
 * Times N calls of each assembly loop and prints, per loop, the average
 * duration of one call, that duration minus the empty baseline loop, and the
 * derived per-instruction figures and ratios.
 *
 * argc and argv are unused. Returns 0.
 *
 * NOTE(review): the "Excluding for-loop" numbers are differences in loop
 * run time. They equal the cost of the extra instruction only if the CPU
 * cannot overlap it with the loop counter update - confirm before reading
 * them as instruction latencies.
 */
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    uint64_t start_time_for = ns();
    for (int i = 0; i < N; i++)
        count_forloop();
    uint64_t end_time_for = ns();
    uint64_t diff_for = (end_time_for - start_time_for) / N;
    printf("10.000.000 iterations of empty forloop: %" PRIu64 "ns\n", diff_for);

    /*
     * Net times are computed in a signed type: measurement noise can make a
     * loop appear faster than the baseline, and an unsigned subtraction would
     * then wrap around to a huge positive value.
     */
    uint64_t start_time_add = ns();
    for (int i = 0; i < N; i++)
        count_addloop();
    uint64_t end_time_add = ns();
    uint64_t diff_add = (end_time_add - start_time_add) / N;
    int64_t net_add = (int64_t)diff_add - (int64_t)diff_for;
    printf("10.000.000 iterations of addloop: %" PRIu64 "ns\n", diff_add);
    printf("10.000.000 iterations of addloop: %" PRId64 "ns (Excluding for-loop)\n", net_add);

    uint64_t start_time_mul = ns();
    for (int i = 0; i < N; i++)
        count_mulloop();
    uint64_t end_time_mul = ns();
    uint64_t diff_mul = (end_time_mul - start_time_mul) / N;
    int64_t net_mul = (int64_t)diff_mul - (int64_t)diff_for;
    printf("10.000.000 iterations of mulloop: %" PRIu64 "ns\n", diff_mul);
    printf("10.000.000 iterations of mulloop: %" PRId64 "ns (Excluding for-loop)\n", net_mul);

    uint64_t start_time_div = ns();
    for (int i = 0; i < N; i++) {
        count_divloop();
    }
    uint64_t end_time_div = ns();
    uint64_t diff_div = (end_time_div - start_time_div) / N;
    int64_t net_div = (int64_t)diff_div - (int64_t)diff_for;
    printf("10.000.000 iterations of divloop: %" PRIu64 "ns\n", diff_div);
    printf("10.000.000 iterations of divloop: %" PRId64 "ns (Excluding for-loop)\n", net_div);

    /* Net time per call divided by the 10000000 loop iterations of one call. */
    double real_add = (double)net_add / 10000000.0;
    double real_mul = (double)net_mul / 10000000.0;
    double real_div = (double)net_div / 10000000.0;
    printf ("Add: %lfns\n", real_add);
    printf ("Mul: %lfns\n", real_mul);
    printf ("Div: %lfns\n", real_div);
    printf ("Mul/Add = %lf\n", real_mul / real_add);
    printf ("Div/Add = %lf\n", real_div / real_add);
    return 0;
}
/*
 * Returns the current reading of the monotonic clock in nanoseconds.
 *
 * CLOCK_MONOTONIC is used instead of CLOCK_REALTIME because the callers only
 * subtract two readings to get an elapsed time, and the realtime clock can be
 * stepped (e.g. by a time adjustment) between the two readings.
 *
 * Returns 0 if clock_gettime() fails.
 */
uint64_t ns(void) {
    struct timespec t;
    if (clock_gettime(CLOCK_MONOTONIC, &t) != 0) {
        return 0;
    }
    return (uint64_t)t.tv_sec * UINT64_C(1000000000) + (uint64_t)t.tv_nsec;
}
我想对加法、乘法和除法等指令的耗时进行基准测试和比较。
10.000.000 iterations of empty forloop: 2415544ns
10.000.000 iterations of addloop: 3074961ns
10.000.000 iterations of addloop: 659417ns (Excluding for-loop)
10.000.000 iterations of mulloop: 7177428ns
10.000.000 iterations of mulloop: 4761884ns (Excluding for-loop)
10.000.000 iterations of divloop: 43092662ns
10.000.000 iterations of divloop: 40677118ns (Excluding for-loop)
Add: 0.065942ns # Here, this is weird
Mul: 0.476188ns
Div: 4.067712ns
Mul/Add = 7.221355
Div/Add = 61.686487
首先,我只对 count_forloop 的调用进行基准测试,以得到除待测指令之外所有指令的耗时;然后测量其他各个函数的耗时,减去空 for 循环所需的时间,再做除法,直到得到该指令执行一次所需的时间。
我的处理器能够达到4.2GHz,在运行此程序时,该值几乎可以在一个核心上达到。
假设它确实能达到 4.2GHz,这意味着每个周期耗时 2.381e-10 秒,即大约 0.24 纳秒。
但是,如您所见,add 指令仅需要 0.065942ns,大约只有一个周期的三分之一。
我多次运行了这个程序,但总是得到相同的结果:add 比处理器本身还快。
我在计算中找不到任何错误。
I have this code (Some instructions are added for benchmark fairness):
.global count_forloop
.global count_addloop
.global count_mulloop
.global count_divloop
# count_forloop: baseline loop that only counts %rax from 0 up to 10000000.
# Takes no arguments. %rsi and %rcx are saved on entry and restored on exit;
# %rax holds 10000000 when the routine returns.
count_forloop:
push %rsi
push %rcx
mov $0xFFFFFFFF, %rsi   # %rsi = 0xFFFFFFFF ...
add %rsi, %rsi          # ... doubled to 0x1FFFFFFFE; not read again in this routine
xor %eax, %eax          # 32-bit write zero-extends, so %rax = 0
_count_forloop1:
inc %rax                # loop counter
cmp $10000000, %rax
jne _count_forloop1     # repeat until %rax == 10000000
pop %rcx
pop %rsi
ret
# count_addloop: same counting loop as count_forloop, with one extra
# `add $3, %rcx` per iteration.
# Takes no arguments. %rsi and %rcx are saved on entry and restored on exit,
# so the accumulated sum in %rcx is discarded.
# NOTE(review): the add only reads and writes %rcx, while inc/cmp/jne only use
# %rax, so there is no data dependency between them. An out-of-order
# superscalar core can presumably execute the add alongside the counter
# update, in which case the time difference to count_forloop is not the
# latency of one add.
count_addloop:
push %rsi
push %rcx
mov $0xFFFFFFFF, %rsi   # %rsi = 0xFFFFFFFF ...
add %rsi, %rsi          # ... doubled to 0x1FFFFFFFE; not read again in this routine
xor %eax, %eax          # loop counter %rax = 0
xor %ecx, %ecx          # accumulator %rcx = 0
_count_addloop1:
inc %rax
add $3, %rcx # Benchmark this instruction
cmp $10000000, %rax
jne _count_addloop1     # repeat until %rax == 10000000
pop %rcx
pop %rsi
ret
# count_mulloop: same counting loop as count_forloop, with one extra
# `imul $3, %rcx` per iteration.
# Takes no arguments. %rsi and %rcx are saved on entry and restored on exit,
# so the product in %rcx is discarded.
# Each imul reads the %rcx written by the previous iteration's imul, so the
# multiplications form a loop-carried dependency chain.
count_mulloop:
push %rsi
push %rcx
mov $0xFFFFFFFF, %rsi   # %rsi = 0xFFFFFFFF ...
add %rsi, %rsi          # ... doubled to 0x1FFFFFFFE; not read again in this routine
xor %eax, %eax          # loop counter %rax = 0
xor %ecx, %ecx
add $4, %ecx            # initial multiplicand: %rcx = 4
_count_mulloop1:
inc %rax
imul $3, %rcx # Benchmark this instruction
cmp $10000000, %rax
jne _count_mulloop1     # repeat until %rax == 10000000
pop %rcx
pop %rsi
ret
# count_divloop: loop of 10000000 iterations, each executing one unsigned
# `div %rsi` (RDX:RAX / RSI -> quotient in %rax, remainder in %rdx).
# Takes no arguments. %rax and %rdx are clobbered; %rsi and %rcx are saved on
# entry and restored on exit.
count_divloop:
push %rsi
push %rcx
mov $0xFFFFFFFF, %rsi   # %rsi = 0xFFFFFFFF ...
add %rsi, %rsi          # ... doubled: divisor = 0x1FFFFFFFE
# Give the dividend RDX:RAX a defined starting value. With %rdx < %rsi the
# quotient always fits in 64 bits, so the first div cannot raise #DE; every
# later div starts from the previous remainder, which is also < %rsi.
mov $1, %eax
xor %edx, %edx
xor %ecx, %ecx          # counter starts at 0 so the body runs exactly 10000000 times
_count_divloop1:
inc %rcx
div %rsi # Benchmark this instruction
cmp $10000000, %rcx
jne _count_divloop1     # repeat until %rcx == 10000000
pop %rcx
pop %rsi
ret
and
#include <time.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#define N 1000
void count_forloop(void);
void count_addloop(void);
void count_mulloop(void);
void count_divloop(void);
uint64_t ns(void);
/*
 * Times N calls of each assembly loop and prints, per loop, the average
 * duration of one call, that duration minus the empty baseline loop, and the
 * derived per-instruction figures and ratios.
 *
 * argc and argv are unused. Returns 0.
 *
 * NOTE(review): the "Excluding for-loop" numbers are differences in loop
 * run time. They equal the cost of the extra instruction only if the CPU
 * cannot overlap it with the loop counter update - confirm before reading
 * them as instruction latencies.
 */
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    uint64_t start_time_for = ns();
    for (int i = 0; i < N; i++)
        count_forloop();
    uint64_t end_time_for = ns();
    uint64_t diff_for = (end_time_for - start_time_for) / N;
    printf("10.000.000 iterations of empty forloop: %" PRIu64 "ns\n", diff_for);

    /*
     * Net times are computed in a signed type: measurement noise can make a
     * loop appear faster than the baseline, and an unsigned subtraction would
     * then wrap around to a huge positive value.
     */
    uint64_t start_time_add = ns();
    for (int i = 0; i < N; i++)
        count_addloop();
    uint64_t end_time_add = ns();
    uint64_t diff_add = (end_time_add - start_time_add) / N;
    int64_t net_add = (int64_t)diff_add - (int64_t)diff_for;
    printf("10.000.000 iterations of addloop: %" PRIu64 "ns\n", diff_add);
    printf("10.000.000 iterations of addloop: %" PRId64 "ns (Excluding for-loop)\n", net_add);

    uint64_t start_time_mul = ns();
    for (int i = 0; i < N; i++)
        count_mulloop();
    uint64_t end_time_mul = ns();
    uint64_t diff_mul = (end_time_mul - start_time_mul) / N;
    int64_t net_mul = (int64_t)diff_mul - (int64_t)diff_for;
    printf("10.000.000 iterations of mulloop: %" PRIu64 "ns\n", diff_mul);
    printf("10.000.000 iterations of mulloop: %" PRId64 "ns (Excluding for-loop)\n", net_mul);

    uint64_t start_time_div = ns();
    for (int i = 0; i < N; i++) {
        count_divloop();
    }
    uint64_t end_time_div = ns();
    uint64_t diff_div = (end_time_div - start_time_div) / N;
    int64_t net_div = (int64_t)diff_div - (int64_t)diff_for;
    printf("10.000.000 iterations of divloop: %" PRIu64 "ns\n", diff_div);
    printf("10.000.000 iterations of divloop: %" PRId64 "ns (Excluding for-loop)\n", net_div);

    /* Net time per call divided by the 10000000 loop iterations of one call. */
    double real_add = (double)net_add / 10000000.0;
    double real_mul = (double)net_mul / 10000000.0;
    double real_div = (double)net_div / 10000000.0;
    printf ("Add: %lfns\n", real_add);
    printf ("Mul: %lfns\n", real_mul);
    printf ("Div: %lfns\n", real_div);
    printf ("Mul/Add = %lf\n", real_mul / real_add);
    printf ("Div/Add = %lf\n", real_div / real_add);
    return 0;
}
/*
 * Returns the current reading of the monotonic clock in nanoseconds.
 *
 * CLOCK_MONOTONIC is used instead of CLOCK_REALTIME because the callers only
 * subtract two readings to get an elapsed time, and the realtime clock can be
 * stepped (e.g. by a time adjustment) between the two readings.
 *
 * Returns 0 if clock_gettime() fails.
 */
uint64_t ns(void) {
    struct timespec t;
    if (clock_gettime(CLOCK_MONOTONIC, &t) != 0) {
        return 0;
    }
    return (uint64_t)t.tv_sec * UINT64_C(1000000000) + (uint64_t)t.tv_nsec;
}
I wanted to benchmark and compare how long instructions such as add, multiply and divide take.
10.000.000 iterations of empty forloop: 2415544ns
10.000.000 iterations of addloop: 3074961ns
10.000.000 iterations of addloop: 659417ns (Excluding for-loop)
10.000.000 iterations of mulloop: 7177428ns
10.000.000 iterations of mulloop: 4761884ns (Excluding for-loop)
10.000.000 iterations of divloop: 43092662ns
10.000.000 iterations of divloop: 40677118ns (Excluding for-loop)
Add: 0.065942ns # Here, this is weird
Mul: 0.476188ns
Div: 4.067712ns
Mul/Add = 7.221355
Div/Add = 61.686487
First I just benchmark the calls to count_forloop in order to get the performance of all instructions except the one I want to measure, then I measure the time for the different functions, subtract the time needed for an empty for-loop and then divide until I have the time for one execution of the instruction.
My processor can reach 4.2GHz, and while this program is running, one core gets close to that frequency.
Let's assume it does reach 4.2GHz; this means that each cycle takes 2.381e-10 seconds, or around 0.24 nanoseconds.
But as you can see, the add instruction only takes 0.065942ns, so only around one third of one cycle.
I tried running the program multiple times, but I always get the same result: add is faster than the processor itself.
I can't find any error in my calculations.
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论