将 MMX/SSE 指令移植到 AltiVec

发布于 2024-10-06 14:32:21 字数 1516 浏览 1 评论 0原文

我在 ASM 方面的经验极其有限,在 SIMD 方面的经验就更少了。

但碰巧我有以下 MMX/SSE 优化代码,我想将其移植到 AltiVec 指令以在 PPC/Cell 处理器上使用。

这可能是一个很大的问题..尽管只有几行代码,但我在尝试弄清楚这里发生的事情时遇到了无穷无尽的麻烦。

原始函数:

static inline int convolve(const short *a, const short *b, int n)
{
    int out = 0;
    union {
        __m64 m64;
        int i32[2];
    } tmp;
    tmp.i32[0] = 0;
    tmp.i32[1] = 0;
    while (n >= 4) {
        tmp.m64 = _mm_add_pi32(tmp.m64,
                               _mm_madd_pi16(*((__m64 *)a),
                                             *((__m64 *)b)));
        a += 4;
        b += 4;
        n -= 4;
    }
    out = tmp.i32[0] + tmp.i32[1];
    _mm_empty();

    while (n --)
        out += (*(a++)) * (*(b++));
    return out;
}

有关如何重写此函数以使用 AltiVec 指令的任何提示?

我的第一次尝试(一次非常错误的尝试)看起来像这样......但它并不完全(甚至远程)正确。

static inline int convolve_altivec(const short *a, const short *b, int n)
{
    int out = 0;
    union {
        vector unsigned int m128;
        int i64[2];
    } tmp;

    vector unsigned int zero = {0, 0, 0, 0};

    tmp.i64[0] = 0;
    tmp.i64[1] = 0;
    while (n >= 8) {
        tmp.m128 = vec_add(tmp.m128,
                               vec_msum(*((vector unsigned short *)a),
                                             *((vector unsigned short *)b), zero));

        a += 8;
        b += 8;
        n -= 8;
    }
    out = tmp.i64[0] + tmp.i64[1];
#endif
    while (n --)
        out += (*(a++)) * (*(b++));
    return out;
}

Let me preface this with.. I have extremely limited experience with ASM, and even less with SIMD.

But it happens that I have the following MMX/SSE optimised code, that I would like to port across to AltiVec instructions for use on PPC/Cell processors.

This is probably a big ask.. Even though it's only a few lines of code, I've had no end of trouble trying to work out what's going on here.

The original function:

static inline int convolve(const short *a, const short *b, int n)
{
    int out = 0;
    union {
        __m64 m64;
        int i32[2];
    } tmp;
    tmp.i32[0] = 0;
    tmp.i32[1] = 0;
    while (n >= 4) {
        tmp.m64 = _mm_add_pi32(tmp.m64,
                               _mm_madd_pi16(*((__m64 *)a),
                                             *((__m64 *)b)));
        a += 4;
        b += 4;
        n -= 4;
    }
    out = tmp.i32[0] + tmp.i32[1];
    _mm_empty();

    while (n --)
        out += (*(a++)) * (*(b++));
    return out;
}

Any tips on how I might rewrite this to use AltiVec instructions?

My first attempt (a very wrong attempt) looks something like this.. But it's not entirely (or even remotely) correct.

static inline int convolve_altivec(const short *a, const short *b, int n)
{
    int out = 0;
    union {
        vector unsigned int m128;
        int i64[2];
    } tmp;

    vector unsigned int zero = {0, 0, 0, 0};

    tmp.i64[0] = 0;
    tmp.i64[1] = 0;
    while (n >= 8) {
        tmp.m128 = vec_add(tmp.m128,
                               vec_msum(*((vector unsigned short *)a),
                                             *((vector unsigned short *)b), zero));

        a += 8;
        b += 8;
        n -= 8;
    }
    out = tmp.i64[0] + tmp.i64[1];
#endif
    while (n --)
        out += (*(a++)) * (*(b++));
    return out;
}

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(2

谁的年少不轻狂 2024-10-13 14:32:21

你已经离我不远了 - 我修复了一些小问题,稍微清理了代码,添加了一个测试工具,现在看起来工作正常:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <altivec.h>

static int convolve_ref(const short *a, const short *b, int n)
{
    int out = 0;
    int i;

    for (i = 0; i < n; ++i)
    {
        out += a[i] * b[i];
    }

    return out;
}

static inline int convolve_altivec(const short *a, const short *b, int n)
{
    int out = 0;
    union {
        vector signed int m128;
        int i32[4];
    } tmp;

    const vector signed int zero = {0, 0, 0, 0};

    assert(((unsigned long)a & 15) == 0);
    assert(((unsigned long)b & 15) == 0);

    tmp.m128 = zero;

    while (n >= 8)
    {
        tmp.m128 = vec_msum(*((vector signed short *)a),
                            *((vector signed short *)b), tmp.m128);

        a += 8;
        b += 8;
        n -= 8;
    }

    out = tmp.i32[0] + tmp.i32[1] + tmp.i32[2] + tmp.i32[3];

    while (n --)
        out += (*(a++)) * (*(b++));

    return out;
}

int main(void)
{
    const int n = 100;

    vector signed short _a[n / 8 + 1];
    vector signed short _b[n / 8 + 1];

    short *a = (short *)_a;
    short *b = (short *)_b;

    int sum_ref, sum_test;

    int i;

    for (i = 0; i < n; ++i)
    {
        a[i] = rand();
        b[i] = rand();
    }

    sum_ref = convolve_ref(a, b, n);
    sum_test = convolve_altivec(a, b, n);

    printf("sum_ref = %d\n", sum_ref);
    printf("sum_test = %d\n", sum_test);

    printf("%s\n", sum_ref == sum_test ? "PASS" : "FAIL");

    return 0;
}

You're not far off - I fixed a few minor problems, cleaned up the code a little, added a test harness, and it seems to work OK now:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <altivec.h>

static int convolve_ref(const short *a, const short *b, int n)
{
    int out = 0;
    int i;

    for (i = 0; i < n; ++i)
    {
        out += a[i] * b[i];
    }

    return out;
}

static inline int convolve_altivec(const short *a, const short *b, int n)
{
    int out = 0;
    union {
        vector signed int m128;
        int i32[4];
    } tmp;

    const vector signed int zero = {0, 0, 0, 0};

    assert(((unsigned long)a & 15) == 0);
    assert(((unsigned long)b & 15) == 0);

    tmp.m128 = zero;

    while (n >= 8)
    {
        tmp.m128 = vec_msum(*((vector signed short *)a),
                            *((vector signed short *)b), tmp.m128);

        a += 8;
        b += 8;
        n -= 8;
    }

    out = tmp.i32[0] + tmp.i32[1] + tmp.i32[2] + tmp.i32[3];

    while (n --)
        out += (*(a++)) * (*(b++));

    return out;
}

int main(void)
{
    const int n = 100;

    vector signed short _a[n / 8 + 1];
    vector signed short _b[n / 8 + 1];

    short *a = (short *)_a;
    short *b = (short *)_b;

    int sum_ref, sum_test;

    int i;

    for (i = 0; i < n; ++i)
    {
        a[i] = rand();
        b[i] = rand();
    }

    sum_ref = convolve_ref(a, b, n);
    sum_test = convolve_altivec(a, b, n);

    printf("sum_ref = %d\n", sum_ref);
    printf("sum_test = %d\n", sum_test);

    printf("%s\n", sum_ref == sum_test ? "PASS" : "FAIL");

    return 0;
}
却一份温柔 2024-10-13 14:32:21

(警告:我所有的 Altivec 经验都来自在 Xbox360/PS3 上的工作 - 我不确定它们与其他 Altivec 平台有多么不同)。

首先,您应该检查指针对齐情况。大多数向量加载(和存储)操作预计来自 16 字节对齐的地址。如果不是,事情通常会在没有警告的情况下继续进行,但您将无法获得预期的数据。

可以(但速度较慢)执行未对齐的加载,但您基本上必须在数据之前和之后阅读一些内容并将它们组合起来。请参阅 Apple 的 Altivec 页面。我也在使用 lvlxlvrx 加载指令之前完成了此操作,然后将它们组合在一起。


接下来,我不确定你的乘法和加法是否相同。我从未使用过 _mm_madd_pi16 或 vec_msum,所以我不确定它们是等效的。您应该在调试器中单步执行,并确保它们为相同的输入数据提供相同的输出。另一个可能的区别是它们可能以不同的方式处理溢出(例如模块化与饱和)。


最后但并非最不重要的一点是,您一次计算 4 个整数,而不是 2 个。因此,您的并集应该保存 4 个整数,并且您应该在最后将所有 4 个整数相加。

(Warning: all of my Altivec experience comes from working on Xbox360/PS3 - I'm not sure how different they are from other Altivec platforms).

First off, you should check your pointer alignment. Most vector loads (and stores) operations are expected to be from 16-byte aligned addresses. If they aren't, things will usually carry on without warning, but you won't get the data you were expecting.

It's possible (but slower) to do unaligned loads, but you basically have to read a bit before and after your data and combine them. See Apple's Altivec page. I've also done it before using an lvlx and lvrx load instructions, and then ORing them together.


Next up, I'm not sure your multiplies and adds are the same. I've never used either _mm_madd_pi16 or vec_msum, so I'm not positive they're equivalent. You should step through in a debugger and make sure they give you the same output for the same input data. Another possible difference is that they may treat overflow differently (e.g. modular vs. saturate).


Last but not least, you're computing 4 ints at a time instead of 2. So your union should hold 4 ints, and you should sum all 4 of them at the end.

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文