使用 sse 执行内在函数
我目前正在开始使用 SSE。 我之前关于 SSE 的问题的答案(使用 SSE 将向量乘以常量)让我想到测试使用 _mm_mul_ps()
这样的内部函数和仅使用 *
这样的“普通运算符”(不确定最好的术语是什么)之间的区别。
所以我写了两个测试用例,它们仅在计算结果的方式上有所不同:
方法 1:
/* Method 1: add two __m128 values with the plain `+` operator and print the inputs and the sum. */
int main(void){
float4 a, b, c;
/* b is loaded with the negated literals of a. */
a.v = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);
b.v = _mm_set_ps(-1.0f, -2.0f, -3.0f, -4.0f);
printf("method 1\n");
/* Statement under test: operator form of the addition (method 2 uses _mm_add_ps here). */
c.v = a.v + b.v; // <---
/* Print both operands, then the labelled result. */
print_vector(a);
print_vector(b);
printf("1.a) Computed output 1: ");
print_vector(c);
exit(EXIT_SUCCESS);
}
方法 2:
/* Method 2: add two __m128 values with the _mm_add_ps() intrinsic and print the inputs and the sum. */
int main(void){
float4 a, b, c;
/* b is loaded with the negated literals of a. */
a.v = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);
b.v = _mm_set_ps(-1.0f, -2.0f, -3.0f, -4.0f);
printf("\nmethod 2\n");
/* Statement under test: intrinsic form of the addition (method 1 uses `+` here). */
c.v = _mm_add_ps(a.v, b.v); // <---
/* Print both operands, then the labelled result. */
print_vector(a);
print_vector(b);
printf("1.b) Computed output 2: ");
print_vector(c);
exit(EXIT_SUCCESS);
}
两个测试用例共享以下内容:
/*
 * An __m128 vector with named access to its four float lanes.
 *
 * The lane names are grouped in an anonymous struct so that x, y, z and w
 * occupy four consecutive floats overlaying v. Declared directly as union
 * members they would all sit at offset 0 and every name would read the same
 * lane. The size of the union is unchanged by the grouping.
 */
typedef union float4{
    __m128 v;
    struct {
        float x, y, z, w;
    };
} float4;
/* Write the x, y, z and w members of v to stdout as one comma-separated line. */
void print_vector (float4 v){
    const float lanes[] = { v.x, v.y, v.z, v.w };

    printf("%f,%f,%f,%f\n", lanes[0], lanes[1], lanes[2], lanes[3]);
}
因此,为了比较两种情况下生成的代码,我使用以下命令进行编译:gcc -ggdb -msse -c t_vectorExtensions_method1.c
结果(仅显示两个向量相加的部分 - 不同):
方法一:
c.v = a.v + b.v;
a1: 0f 57 c9 xorps %xmm1,%xmm1
a4: 0f 12 4d d0 movlps -0x30(%rbp),%xmm1
a8: 0f 16 4d d8 movhps -0x28(%rbp),%xmm1
ac: 0f 57 c0 xorps %xmm0,%xmm0
af: 0f 12 45 c0 movlps -0x40(%rbp),%xmm0
b3: 0f 16 45 c8 movhps -0x38(%rbp),%xmm0
b7: 0f 58 c1 addps %xmm1,%xmm0
ba: 0f 13 45 b0 movlps %xmm0,-0x50(%rbp)
be: 0f 17 45 b8 movhps %xmm0,-0x48(%rbp)
方法二:
c.v = _mm_add_ps(a.v, b.v);
a1: 0f 57 c0 xorps %xmm0,%xmm0
a4: 0f 12 45 a0 movlps -0x60(%rbp),%xmm0
a8: 0f 16 45 a8 movhps -0x58(%rbp),%xmm0
ac: 0f 57 c9 xorps %xmm1,%xmm1
af: 0f 12 4d b0 movlps -0x50(%rbp),%xmm1
b3: 0f 16 4d b8 movhps -0x48(%rbp),%xmm1
b7: 0f 13 4d f0 movlps %xmm1,-0x10(%rbp)
bb: 0f 17 4d f8 movhps %xmm1,-0x8(%rbp)
bf: 0f 13 45 e0 movlps %xmm0,-0x20(%rbp)
c3: 0f 17 45 e8 movhps %xmm0,-0x18(%rbp)
/* Perform the respective operation on the four SPFP values in A and B. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
c7: 0f 57 c0 xorps %xmm0,%xmm0
ca: 0f 12 45 e0 movlps -0x20(%rbp),%xmm0
ce: 0f 16 45 e8 movhps -0x18(%rbp),%xmm0
d2: 0f 57 c9 xorps %xmm1,%xmm1
d5: 0f 12 4d f0 movlps -0x10(%rbp),%xmm1
d9: 0f 16 4d f8 movhps -0x8(%rbp),%xmm1
dd: 0f 58 c1 addps %xmm1,%xmm0
e0: 0f 13 45 90 movlps %xmm0,-0x70(%rbp)
e4: 0f 17 45 98 movhps %xmm0,-0x68(%rbp)
显然使用内部_mm_add_ps()
生成的代码要大得多。这是为什么呢?难道不应该产生更好的代码吗?
I am currently getting started with SSE.
The answer to my previous question regarding SSE ( Multiplying vector by constant using SSE ) brought me to the idea to test the difference between using intrinsics like _mm_mul_ps()
and just using 'normal operators' (not sure what the best term is) like *
.
So I wrote two test cases which differ only in the way the result is calculated:
Method 1:
/* Method 1: add two __m128 values with the plain `+` operator and print the inputs and the sum. */
int main(void){
float4 a, b, c;
/* b is loaded with the negated literals of a. */
a.v = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);
b.v = _mm_set_ps(-1.0f, -2.0f, -3.0f, -4.0f);
printf("method 1\n");
/* Statement under test: operator form of the addition (method 2 uses _mm_add_ps here). */
c.v = a.v + b.v; // <---
/* Print both operands, then the labelled result. */
print_vector(a);
print_vector(b);
printf("1.a) Computed output 1: ");
print_vector(c);
exit(EXIT_SUCCESS);
}
Method 2:
/* Method 2: add two __m128 values with the _mm_add_ps() intrinsic and print the inputs and the sum. */
int main(void){
float4 a, b, c;
/* b is loaded with the negated literals of a. */
a.v = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);
b.v = _mm_set_ps(-1.0f, -2.0f, -3.0f, -4.0f);
printf("\nmethod 2\n");
/* Statement under test: intrinsic form of the addition (method 1 uses `+` here). */
c.v = _mm_add_ps(a.v, b.v); // <---
/* Print both operands, then the labelled result. */
print_vector(a);
print_vector(b);
printf("1.b) Computed output 2: ");
print_vector(c);
exit(EXIT_SUCCESS);
}
Both test cases share the following:
/*
 * An __m128 vector with named access to its four float lanes.
 *
 * The lane names are grouped in an anonymous struct so that x, y, z and w
 * occupy four consecutive floats overlaying v. Declared directly as union
 * members they would all sit at offset 0 and every name would read the same
 * lane. The size of the union is unchanged by the grouping.
 */
typedef union float4{
    __m128 v;
    struct {
        float x, y, z, w;
    };
} float4;
/* Write the x, y, z and w members of v to stdout as one comma-separated line. */
void print_vector (float4 v){
    const float lanes[] = { v.x, v.y, v.z, v.w };

    printf("%f,%f,%f,%f\n", lanes[0], lanes[1], lanes[2], lanes[3]);
}
So to compare the code generated for both cases I compiled using: gcc -ggdb -msse -c t_vectorExtensions_method1.c
Which resulted in (showing only the part where the two vectors are added, which is where the listings differ):
Method 1:
c.v = a.v + b.v;
a1: 0f 57 c9 xorps %xmm1,%xmm1
a4: 0f 12 4d d0 movlps -0x30(%rbp),%xmm1
a8: 0f 16 4d d8 movhps -0x28(%rbp),%xmm1
ac: 0f 57 c0 xorps %xmm0,%xmm0
af: 0f 12 45 c0 movlps -0x40(%rbp),%xmm0
b3: 0f 16 45 c8 movhps -0x38(%rbp),%xmm0
b7: 0f 58 c1 addps %xmm1,%xmm0
ba: 0f 13 45 b0 movlps %xmm0,-0x50(%rbp)
be: 0f 17 45 b8 movhps %xmm0,-0x48(%rbp)
Method 2:
c.v = _mm_add_ps(a.v, b.v);
a1: 0f 57 c0 xorps %xmm0,%xmm0
a4: 0f 12 45 a0 movlps -0x60(%rbp),%xmm0
a8: 0f 16 45 a8 movhps -0x58(%rbp),%xmm0
ac: 0f 57 c9 xorps %xmm1,%xmm1
af: 0f 12 4d b0 movlps -0x50(%rbp),%xmm1
b3: 0f 16 4d b8 movhps -0x48(%rbp),%xmm1
b7: 0f 13 4d f0 movlps %xmm1,-0x10(%rbp)
bb: 0f 17 4d f8 movhps %xmm1,-0x8(%rbp)
bf: 0f 13 45 e0 movlps %xmm0,-0x20(%rbp)
c3: 0f 17 45 e8 movhps %xmm0,-0x18(%rbp)
/* Perform the respective operation on the four SPFP values in A and B. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
c7: 0f 57 c0 xorps %xmm0,%xmm0
ca: 0f 12 45 e0 movlps -0x20(%rbp),%xmm0
ce: 0f 16 45 e8 movhps -0x18(%rbp),%xmm0
d2: 0f 57 c9 xorps %xmm1,%xmm1
d5: 0f 12 4d f0 movlps -0x10(%rbp),%xmm1
d9: 0f 16 4d f8 movhps -0x8(%rbp),%xmm1
dd: 0f 58 c1 addps %xmm1,%xmm0
e0: 0f 13 45 90 movlps %xmm0,-0x70(%rbp)
e4: 0f 17 45 98 movhps %xmm0,-0x68(%rbp)
Obviously the code generated when using the intrinsic _mm_add_ps()
is much larger. Why is this? Shouldn't it result in better code?
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
真正重要的是
addps
。在更实际的用例中,例如在循环中将两个大的浮点数向量相加,循环体将只包含 addps、两次加载和一次存储,以及一些用于地址运算的标量整数指令。在现代超标量 CPU 上,其中许多指令将并行执行。另请注意,您是在禁用优化的情况下进行编译的,因此不会得到特别高效的代码。请尝试 gcc -O3 -msse3 ... 。
All that really matters is the
addps
. In a more realistic use case, where you might be, say, adding two large vectors of floats in a loop, the body of the loop will just contain addps
, two loads and a store, and some scalar integer instructions for address arithmetic. On a modern superscalar CPU many of these instructions will execute in parallel. Note also that you're compiling with optimisation disabled, so you won't get particularly efficient code. Try
gcc -O3 -msse3 ...
.