使用自动矢量化和 sse 加速对数据大小的依赖
我正在尝试使用英特尔编译器的自动矢量化和 sse 来加速一些代码。 所有计算都是将某些结构体node_t转换为另一个结构体w_t(函数tr()和gen_tr())。 当我尝试向量化函数 gen_tr() 时,它不会产生任何效果。
如果更改数据存储格式,当每个结构体组件存储在不同的浮点数组中时,自动矢量化效果很好,请参阅函数genv_tr()。
手写 SSE 内建函数的版本是 ssev_tr(N 必须能被 4 整除)。
transform.c:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <xmmintrin.h>
/*
 * Read the CPU time-stamp counter with the x86 RDTSC instruction and return
 * it as one integer value.
 */
static __inline__ unsigned long getCC(void)
{
/* RDTSC delivers the counter in two 32-bit halves: EAX (low) and EDX (high). */
unsigned a, d;
asm volatile("rdtsc" : "=a" (a), "=d" (d));
/* Recombine the halves; the shift by 32 relies on unsigned long being wider than 32 bits. */
return ((unsigned long)a) | (((unsigned long)d) << 32);
}
/* Input record (array-of-structs layout): five single-precision fields. */
typedef struct {
    float x1;
    float x2;
    float x3;
    float x4;
    float x5;
} node_t;
/* Output record (array-of-structs layout): four single-precision fields. */
typedef struct {
    float w1;
    float w2;
    float w3;
    float w4;
} w_t;
/*
 * Transform a single input record into a single output record.
 *
 *   w1 = x1 - x3 * c1      w2 = x1 + x3 * c1
 *   w3 = x2 - x4 * c2      w4 = x2 + x4 * c2
 *
 * n  - record to read (x5 is not used)
 * c1 - scale factor applied to x3
 * c2 - scale factor applied to x4
 * w  - record that receives the four results
 */
void tr(node_t *n, float c1, float c2, w_t *w)
{
    /* All inputs are read before any output is stored. */
    const float first = n->x1;
    const float first_delta = n->x3 * c1;
    const float second = n->x2;
    const float second_delta = n->x4 * c2;

    w->w1 = first - first_delta;
    w->w2 = first + first_delta;
    w->w3 = second - second_delta;
    w->w4 = second + second_delta;
}
/*
 * Array-of-structs kernel: apply tr() to each of the N records in n[],
 * storing the result in the record of w[] with the same index.
 *
 * Kept out of line (noinline) so the benchmark times a real call.
 * The two pragmas are vectorization hints for the Intel compiler.
 */
__attribute__ ((noinline))
void gen_tr(node_t *n, w_t *w, const int N, float c1, float c2)
{
    int idx;

#pragma vector aligned
#pragma ivdep
    for (idx = 0; idx < N; ++idx) {
        tr(&n[idx], c1, c2, &w[idx]);
    }
}
/*
 * Struct-of-arrays kernel: for every index k in [0, N)
 *
 *   w1[k] = x1[k] - x3[k] * c1      w2[k] = x1[k] + x3[k] * c1
 *   w3[k] = x2[k] - x4[k] * c2      w4[k] = x2[k] + x4[k] * c2
 *
 * x5 is accepted but never read. Kept out of line (noinline) so the
 * benchmark times a real call. The two pragmas are vectorization hints
 * for the Intel compiler.
 */
__attribute__ ((noinline))
void genv_tr(float *x1, float *x2, float *x3, float *x4, float *x5, float *w1, float *w2, float *w3, float *w4, const int N, float c1, float c2)
{
    int k;

    (void)x5;

#pragma vector aligned
#pragma ivdep
    for (k = 0; k < N; ++k) {
        const float delta1 = x3[k] * c1;
        const float delta2 = x4[k] * c2;

        w1[k] = x1[k] - delta1;
        w2[k] = x1[k] + delta1;
        w3[k] = x2[k] - delta2;
        w4[k] = x2[k] + delta2;
    }
}
/*
 * Struct-of-arrays kernel written with SSE intrinsics: for every index k
 * in [0, N)
 *
 *   w1[k] = x1[k] - x3[k] * c1      w2[k] = x1[k] + x3[k] * c1
 *   w3[k] = x2[k] - x4[k] * c2      w4[k] = x2[k] + x4[k] * c2
 *
 * Elements are processed four at a time through __m128 pointers, so every
 * array must be 16-byte aligned. Any N % 4 trailing elements are handled
 * by a scalar loop, so N does not have to be a multiple of 4.
 * x5 is accepted but never read. Kept out of line (noinline) so the
 * benchmark times a real call.
 */
__attribute__ ((noinline))
void ssev_tr(float *x1, float *x2, float *x3, float *x4, float *x5, float *w1, float *w2, float *w3, float *w4, const int N, float c1, float c2)
{
    __m128 *ws1 = (__m128*)w1;
    __m128 *ws2 = (__m128*)w2;
    __m128 *ws3 = (__m128*)w3;
    __m128 *ws4 = (__m128*)w4;
    __m128 *xs1 = (__m128*)x1;
    __m128 *xs2 = (__m128*)x2;
    __m128 *xs3 = (__m128*)x3;
    __m128 *xs4 = (__m128*)x4;
    /* Broadcast each scale factor into all four lanes. */
    const __m128 cs1 = _mm_set1_ps(c1);
    const __m128 cs2 = _mm_set1_ps(c2);
    const int groups = N / 4;   /* number of complete 4-float vectors */
    int i;

    (void)x5;

#pragma vector aligned
#pragma ivdep
    for (i = 0; i < groups; i++) {
        const __m128 N00T = _mm_mul_ps(xs3[i], cs1);
        const __m128 N01T = _mm_mul_ps(xs4[i], cs2);
        ws1[i] = _mm_sub_ps(xs1[i], N00T);
        ws2[i] = _mm_add_ps(xs1[i], N00T);
        ws3[i] = _mm_sub_ps(xs2[i], N01T);
        ws4[i] = _mm_add_ps(xs2[i], N01T);
    }

    /* Scalar tail: the elements left over when N is not a multiple of 4. */
    for (i = groups * 4; i < N; i++) {
        const float N00T = x3[i] * c1;
        const float N01T = x4[i] * c2;
        w1[i] = x1[i] - N00T;
        w2[i] = x1[i] + N00T;
        w3[i] = x2[i] - N01T;
        w4[i] = x2[i] + N01T;
    }
}
/*
 * Benchmark an array-of-structs kernel `func(x, w, n, c1, c2)`.
 *
 * Fills x[0..n) with fixed values, calls func `rep` times between two
 * getCC() readings, and prints the elapsed count divided by n and by rep.
 *
 * Expands in a scope that must provide: i, n, rep, x, w, c1, c2, t1, t2.
 * Wrapped in do { } while (0) so that `test(f);` is a single statement and
 * stays correct under an unbraced if/else.
 */
#define test(func) \
    do { \
        for (i = 0; i < n; i++) { \
            x[i].x1 = 1.0; \
            x[i].x2 = 2.0; \
            x[i].x3 = 2.0; \
            x[i].x4 = 2.0; \
            x[i].x5 = 2.0; \
        } \
        \
        t1 = getCC(); \
        for (i = 0; i < rep; i++) { \
            func(x, w, n, c1, c2); \
        } \
        t2 = getCC(); \
        printf("\t%f", ((double)(t2 - t1)) / n / rep); \
    } while (0)
/*
 * Benchmark a struct-of-arrays kernel
 * `func(x1, x2, x3, x4, x5, w1, w2, w3, w4, n, c1, c2)`.
 *
 * Fills x1..x5[0..n) with fixed values, calls func `rep` times between two
 * getCC() readings, and prints the elapsed count divided by n and by rep.
 *
 * Expands in a scope that must provide: i, n, rep, x1..x5, w1..w4, c1, c2,
 * t1, t2. Wrapped in do { } while (0) so that `test1(f);` is a single
 * statement and stays correct under an unbraced if/else.
 */
#define test1(func) \
    do { \
        for (i = 0; i < n; i++) { \
            x1[i] = 1.0; \
            x2[i] = 2.0; \
            x3[i] = 2.0; \
            x4[i] = 2.0; \
            x5[i] = 2.0; \
        } \
        \
        t1 = getCC(); \
        for (i = 0; i < rep; i++) { \
            func(x1, x2, x3, x4, x5, w1, w2, w3, w4, n, c1, c2); \
        } \
        t2 = getCC(); \
        printf("\t%f", ((double)(t2 - t1)) / n / rep); \
    } while (0)
/*
 * Benchmark driver.
 *
 * Usage: transform vector_size
 *
 * Prints vector_size followed by one tab-separated timing per kernel
 * (gen_tr, genv_tr, ssev_tr), then a newline. Each kernel is repeated
 * 100000000 / vector_size times so the total work is roughly constant.
 *
 * Returns 0 on success, 1 on a usage error, invalid size or failed allocation.
 */
int main(int argc, char *argv[])
{
    if (argc < 2) {
        printf("Usage %s vector_size\n", argv[0]);
        return 1;   /* without this, argv[1] below would be NULL */
    }

    /*
     * The size must be positive (it is used as a divisor) and at most
     * 100000000 so that the repetition count is at least 1.
     */
    char *end;
    const long requested = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || requested < 1 || requested > 100000000L) {
        fprintf(stderr, "vector_size must be an integer between 1 and 100000000\n");
        return 1;
    }

    int n = (int)requested;
    printf("%d", n);
    int rep = 100000000 / n;
    int i;
    float c1 = 2.0, c2 = 1.0;
    unsigned long t1, t2;
    int status = 1;
    const size_t count = (size_t)n;

    /*
     * NOTE(review): genv_tr/ssev_tr are written for 16-byte-aligned arrays.
     * Plain malloc is kept so the memory layout being measured is unchanged;
     * confirm that malloc's alignment is sufficient on the target platform.
     */
    node_t *x = malloc(count * sizeof *x);
    w_t *w = malloc(count * sizeof *w);
    float *x1 = malloc(count * sizeof *x1);
    float *x2 = malloc(count * sizeof *x2);
    float *x3 = malloc(count * sizeof *x3);
    float *x4 = malloc(count * sizeof *x4);
    float *x5 = malloc(count * sizeof *x5);
    float *w1 = malloc(count * sizeof *w1);
    float *w2 = malloc(count * sizeof *w2);
    float *w3 = malloc(count * sizeof *w3);
    float *w4 = malloc(count * sizeof *w4);

    if (x && w && x1 && x2 && x3 && x4 && x5 && w1 && w2 && w3 && w4) {
        test(gen_tr);
        test1(genv_tr);
        test1(ssev_tr);
        printf("\n");
        status = 0;
    } else {
        printf("\n");
        fprintf(stderr, "out of memory\n");
    }

    /* free(NULL) is a no-op, so this is safe after a partial allocation. */
    free(x);
    free(w);
    free(x1);
    free(x2);
    free(x3);
    free(x4);
    free(x5);
    free(w1);
    free(w2);
    free(w3);
    free(w4);
    return status;
}
编译选项:icc -O3 -Wall -W -vec-report6 transform.c -o transform
icc 版本 - 12.1.2,操作系统 - Fedora 16 x86_64,CPU - Intel Core2 Quad CPU Q8200。
然后我以 64 为步长,用从 16 到 3000 的不同数据大小运行该程序,脚本如下:
#!/bin/bash
# Run the benchmark for sizes 16, 80, 144, ... (below 3000), showing each
# result line on the terminal and appending it to run.log.

# Start the log over; this leaves a single empty line at its top.
echo "" > run.log

for ((c = 16; c < 3000; c += 64)); do
    ./transform "$c" | tee -a run.log
done
以下是该脚本的部分运行结果(各列依次为:大小、gen_tr、genv_tr、ssev_tr),所有时间均为平均到每个数组元素的 rdtsc 计数:
16 7.710743 3.168577 3.253829
272 7.166493 1.983918 2.618569
528 7.121866 1.920195 2.567109
784 7.115007 1.899451 2.549645
1040 8.104026 2.481062 2.944317
1296 8.137537 5.105032 5.104614
1552 8.118534 5.068812 5.064211
1808 8.138309 5.077831 5.085015
2064 8.149699 5.107503 5.069958
2320 8.164556 5.080981 5.099313
2576 8.151524 5.086056 5.089294
2832 8.212946 5.061927 5.072261
为什么使用向量化版本的函数时,在大小约为 1000 处会出现如此显著的变化?是缓存未命中造成的吗?能否在整个数据范围内保持相同的速度?
I'm trying to speed up some code using the Intel compiler's auto-vectorization and SSE.
All of the computation consists of transforming a node_t struct into a w_t struct (functions tr() and gen_tr()).
When I try to vectorize gen_tr(), it has no effect.
If I change the data layout so that each struct component is stored in its own array of floats, auto-vectorization works well; see genv_tr().
The function that uses SSE intrinsics is called ssev_tr (N must be evenly divisible by 4).
transform.c:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <xmmintrin.h>
/*
 * Read the CPU time-stamp counter with the x86 RDTSC instruction and return
 * it as one integer value.
 */
static __inline__ unsigned long getCC(void)
{
/* RDTSC delivers the counter in two 32-bit halves: EAX (low) and EDX (high). */
unsigned a, d;
asm volatile("rdtsc" : "=a" (a), "=d" (d));
/* Recombine the halves; the shift by 32 relies on unsigned long being wider than 32 bits. */
return ((unsigned long)a) | (((unsigned long)d) << 32);
}
/* Input record (array-of-structs layout): five single-precision fields. */
typedef struct {
    float x1;
    float x2;
    float x3;
    float x4;
    float x5;
} node_t;
/* Output record (array-of-structs layout): four single-precision fields. */
typedef struct {
    float w1;
    float w2;
    float w3;
    float w4;
} w_t;
/*
 * Transform a single input record into a single output record.
 *
 *   w1 = x1 - x3 * c1      w2 = x1 + x3 * c1
 *   w3 = x2 - x4 * c2      w4 = x2 + x4 * c2
 *
 * n  - record to read (x5 is not used)
 * c1 - scale factor applied to x3
 * c2 - scale factor applied to x4
 * w  - record that receives the four results
 */
void tr(node_t *n, float c1, float c2, w_t *w)
{
    /* All inputs are read before any output is stored. */
    const float first = n->x1;
    const float first_delta = n->x3 * c1;
    const float second = n->x2;
    const float second_delta = n->x4 * c2;

    w->w1 = first - first_delta;
    w->w2 = first + first_delta;
    w->w3 = second - second_delta;
    w->w4 = second + second_delta;
}
/*
 * Array-of-structs kernel: apply tr() to each of the N records in n[],
 * storing the result in the record of w[] with the same index.
 *
 * Kept out of line (noinline) so the benchmark times a real call.
 * The two pragmas are vectorization hints for the Intel compiler.
 */
__attribute__ ((noinline))
void gen_tr(node_t *n, w_t *w, const int N, float c1, float c2)
{
    int idx;

#pragma vector aligned
#pragma ivdep
    for (idx = 0; idx < N; ++idx) {
        tr(&n[idx], c1, c2, &w[idx]);
    }
}
/*
 * Struct-of-arrays kernel: for every index k in [0, N)
 *
 *   w1[k] = x1[k] - x3[k] * c1      w2[k] = x1[k] + x3[k] * c1
 *   w3[k] = x2[k] - x4[k] * c2      w4[k] = x2[k] + x4[k] * c2
 *
 * x5 is accepted but never read. Kept out of line (noinline) so the
 * benchmark times a real call. The two pragmas are vectorization hints
 * for the Intel compiler.
 */
__attribute__ ((noinline))
void genv_tr(float *x1, float *x2, float *x3, float *x4, float *x5, float *w1, float *w2, float *w3, float *w4, const int N, float c1, float c2)
{
    int k;

    (void)x5;

#pragma vector aligned
#pragma ivdep
    for (k = 0; k < N; ++k) {
        const float delta1 = x3[k] * c1;
        const float delta2 = x4[k] * c2;

        w1[k] = x1[k] - delta1;
        w2[k] = x1[k] + delta1;
        w3[k] = x2[k] - delta2;
        w4[k] = x2[k] + delta2;
    }
}
/*
 * Struct-of-arrays kernel written with SSE intrinsics: for every index k
 * in [0, N)
 *
 *   w1[k] = x1[k] - x3[k] * c1      w2[k] = x1[k] + x3[k] * c1
 *   w3[k] = x2[k] - x4[k] * c2      w4[k] = x2[k] + x4[k] * c2
 *
 * Elements are processed four at a time through __m128 pointers, so every
 * array must be 16-byte aligned. Any N % 4 trailing elements are handled
 * by a scalar loop, so N does not have to be a multiple of 4.
 * x5 is accepted but never read. Kept out of line (noinline) so the
 * benchmark times a real call.
 */
__attribute__ ((noinline))
void ssev_tr(float *x1, float *x2, float *x3, float *x4, float *x5, float *w1, float *w2, float *w3, float *w4, const int N, float c1, float c2)
{
    __m128 *ws1 = (__m128*)w1;
    __m128 *ws2 = (__m128*)w2;
    __m128 *ws3 = (__m128*)w3;
    __m128 *ws4 = (__m128*)w4;
    __m128 *xs1 = (__m128*)x1;
    __m128 *xs2 = (__m128*)x2;
    __m128 *xs3 = (__m128*)x3;
    __m128 *xs4 = (__m128*)x4;
    /* Broadcast each scale factor into all four lanes. */
    const __m128 cs1 = _mm_set1_ps(c1);
    const __m128 cs2 = _mm_set1_ps(c2);
    const int groups = N / 4;   /* number of complete 4-float vectors */
    int i;

    (void)x5;

#pragma vector aligned
#pragma ivdep
    for (i = 0; i < groups; i++) {
        const __m128 N00T = _mm_mul_ps(xs3[i], cs1);
        const __m128 N01T = _mm_mul_ps(xs4[i], cs2);
        ws1[i] = _mm_sub_ps(xs1[i], N00T);
        ws2[i] = _mm_add_ps(xs1[i], N00T);
        ws3[i] = _mm_sub_ps(xs2[i], N01T);
        ws4[i] = _mm_add_ps(xs2[i], N01T);
    }

    /* Scalar tail: the elements left over when N is not a multiple of 4. */
    for (i = groups * 4; i < N; i++) {
        const float N00T = x3[i] * c1;
        const float N01T = x4[i] * c2;
        w1[i] = x1[i] - N00T;
        w2[i] = x1[i] + N00T;
        w3[i] = x2[i] - N01T;
        w4[i] = x2[i] + N01T;
    }
}
/*
 * Benchmark an array-of-structs kernel `func(x, w, n, c1, c2)`.
 *
 * Fills x[0..n) with fixed values, calls func `rep` times between two
 * getCC() readings, and prints the elapsed count divided by n and by rep.
 *
 * Expands in a scope that must provide: i, n, rep, x, w, c1, c2, t1, t2.
 * Wrapped in do { } while (0) so that `test(f);` is a single statement and
 * stays correct under an unbraced if/else.
 */
#define test(func) \
    do { \
        for (i = 0; i < n; i++) { \
            x[i].x1 = 1.0; \
            x[i].x2 = 2.0; \
            x[i].x3 = 2.0; \
            x[i].x4 = 2.0; \
            x[i].x5 = 2.0; \
        } \
        \
        t1 = getCC(); \
        for (i = 0; i < rep; i++) { \
            func(x, w, n, c1, c2); \
        } \
        t2 = getCC(); \
        printf("\t%f", ((double)(t2 - t1)) / n / rep); \
    } while (0)
/*
 * Benchmark a struct-of-arrays kernel
 * `func(x1, x2, x3, x4, x5, w1, w2, w3, w4, n, c1, c2)`.
 *
 * Fills x1..x5[0..n) with fixed values, calls func `rep` times between two
 * getCC() readings, and prints the elapsed count divided by n and by rep.
 *
 * Expands in a scope that must provide: i, n, rep, x1..x5, w1..w4, c1, c2,
 * t1, t2. Wrapped in do { } while (0) so that `test1(f);` is a single
 * statement and stays correct under an unbraced if/else.
 */
#define test1(func) \
    do { \
        for (i = 0; i < n; i++) { \
            x1[i] = 1.0; \
            x2[i] = 2.0; \
            x3[i] = 2.0; \
            x4[i] = 2.0; \
            x5[i] = 2.0; \
        } \
        \
        t1 = getCC(); \
        for (i = 0; i < rep; i++) { \
            func(x1, x2, x3, x4, x5, w1, w2, w3, w4, n, c1, c2); \
        } \
        t2 = getCC(); \
        printf("\t%f", ((double)(t2 - t1)) / n / rep); \
    } while (0)
/*
 * Benchmark driver.
 *
 * Usage: transform vector_size
 *
 * Prints vector_size followed by one tab-separated timing per kernel
 * (gen_tr, genv_tr, ssev_tr), then a newline. Each kernel is repeated
 * 100000000 / vector_size times so the total work is roughly constant.
 *
 * Returns 0 on success, 1 on a usage error, invalid size or failed allocation.
 */
int main(int argc, char *argv[])
{
    if (argc < 2) {
        printf("Usage %s vector_size\n", argv[0]);
        return 1;   /* without this, argv[1] below would be NULL */
    }

    /*
     * The size must be positive (it is used as a divisor) and at most
     * 100000000 so that the repetition count is at least 1.
     */
    char *end;
    const long requested = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || requested < 1 || requested > 100000000L) {
        fprintf(stderr, "vector_size must be an integer between 1 and 100000000\n");
        return 1;
    }

    int n = (int)requested;
    printf("%d", n);
    int rep = 100000000 / n;
    int i;
    float c1 = 2.0, c2 = 1.0;
    unsigned long t1, t2;
    int status = 1;
    const size_t count = (size_t)n;

    /*
     * NOTE(review): genv_tr/ssev_tr are written for 16-byte-aligned arrays.
     * Plain malloc is kept so the memory layout being measured is unchanged;
     * confirm that malloc's alignment is sufficient on the target platform.
     */
    node_t *x = malloc(count * sizeof *x);
    w_t *w = malloc(count * sizeof *w);
    float *x1 = malloc(count * sizeof *x1);
    float *x2 = malloc(count * sizeof *x2);
    float *x3 = malloc(count * sizeof *x3);
    float *x4 = malloc(count * sizeof *x4);
    float *x5 = malloc(count * sizeof *x5);
    float *w1 = malloc(count * sizeof *w1);
    float *w2 = malloc(count * sizeof *w2);
    float *w3 = malloc(count * sizeof *w3);
    float *w4 = malloc(count * sizeof *w4);

    if (x && w && x1 && x2 && x3 && x4 && x5 && w1 && w2 && w3 && w4) {
        test(gen_tr);
        test1(genv_tr);
        test1(ssev_tr);
        printf("\n");
        status = 0;
    } else {
        printf("\n");
        fprintf(stderr, "out of memory\n");
    }

    /* free(NULL) is a no-op, so this is safe after a partial allocation. */
    free(x);
    free(w);
    free(x1);
    free(x2);
    free(x3);
    free(x4);
    free(x5);
    free(w1);
    free(w2);
    free(w3);
    free(w4);
    return status;
}
Compile options: icc -O3 -Wall -W -vec-report6 transform.c -o transform
Version of icc - 12.1.2, OS - Fedora 16 x86_64, CPU - Intel Core2 Quad CPU Q8200.
Then I run it with sizes from 16 to 3000 in steps of 64; here is the script:
#!/bin/bash
# Run the benchmark for sizes 16, 80, 144, ... (below 3000), showing each
# result line on the terminal and appending it to run.log.

# Start the log over; this leaves a single empty line at its top.
echo "" > run.log

for ((c = 16; c < 3000; c += 64)); do
    ./transform "$c" | tee -a run.log
done
Here are some of the results from this script (size, gen_tr, genv_tr, ssev_tr); all times are per array element:
16 7.710743 3.168577 3.253829
272 7.166493 1.983918 2.618569
528 7.121866 1.920195 2.567109
784 7.115007 1.899451 2.549645
1040 8.104026 2.481062 2.944317
1296 8.137537 5.105032 5.104614
1552 8.118534 5.068812 5.064211
1808 8.138309 5.077831 5.085015
2064 8.149699 5.107503 5.069958
2320 8.164556 5.080981 5.099313
2576 8.151524 5.086056 5.089294
2832 8.212946 5.061927 5.072261
Why is there such a significant change around size 1000 when using the vectorized versions of the function? Is it because of cache misses? Is it possible to keep the same speed across the whole range of sizes?
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

你有 8 个浮点数组。当它们的大小为 1000 时,您的测试将处理大约 32kB 的数据。尽管您的 L1 缓存可能稍大一些 (64kB),但由于关联性,L1 缓存可能无法同时保存所有 32kB 数据。
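您的测试会反复迭代,一遍又一遍地处理相同的数据。考虑两种情况:
- 大小低于约 1000:所有数组可以同时放入 L1 缓存,因此第一遍之后,每次重复所需的数据都已在 L1 中。
- 大小高于约 1000:所有数组无法同时放入 L1 缓存,因此当下一次重复回到数组开头时,那部分数据已被逐出,必须重新从 L2 加载。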
因此,输入大小 1000 处的跳转部分是测试的产物,但并非完全如此。在现实世界中,如果您碰巧在 L1 缓存中拥有所需的所有数据,genv_tr 将非常快。但对于大小 > 1000 的输入,所有输入根本无法放入 L1 缓存,因此某些访问肯定会进入 L2。
You have 8 float arrays. When they are of size 1000, your test is manipulating about 32kB of data. Even though your L1 cache is probably a bit larger (64kB), the L1 cache is likely unable to hold all the 32kB data at the same time due to associativity.
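Your test iterates, processing the same data over and over again. Consider the two cases:
- Size below about 1000: all of the arrays fit in the L1 cache together, so after the first pass every repetition finds its data already in L1.
- Size above about 1000: the arrays do not all fit, so by the time a repetition comes back to the start of the arrays, that data has been evicted and must be reloaded from L2.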
So the jump at input size 1000 is partly an artifact of your test, but not entirely. In the real world, if you already happen to have all the data you need in the L1 cache, genv_tr will be really fast. But on inputs of size >1000, all the inputs simply do not fit into the L1 cache, so some accesses will definitely go to L2.