
发布于 2025-01-26 08:27:42 字数 9699 浏览 2 评论 0原文

最近,我在阅读一些开源项目中的 ARM 内联汇编代码,发现了类似下面这样的写法:

// arm32
asm volatile(
    // ...
    "vmla.f32   q6, q12, %q16           \n"
    // ...
// arm64
asm volatile(
    // ...
    "fmla  v16.4s, v22.4s, %16.s[0]     \n"
    // ...

我找不到任何文档介绍 % 操作数后面跟着的这些字母。在 ARM32 中有 %e、%f、%q,而在 ARM64 中我只见到了 “s”。那么,这些字母分别是什么意思?

编译器是 clang。下面是相对完整的代码:

// ARM32 codes
    // Output pointers, one per output channel, starting at out0..out3.
    float* outptr0 = out0;
    float* outptr1 = out1;
    float* outptr2 = out2;
    float* outptr3 = out3;

    // Four consecutive input channels q..q+3 of bottom_blob.
    const float* img0 = bottom_blob.channel(q);
    const float* img1 = bottom_blob.channel(q + 1);
    const float* img2 = bottom_blob.channel(q + 2);
    const float* img3 = bottom_blob.channel(q + 3);

    // Weight rows for outputs p..p+3: each row is `inch` floats long and is
    // entered at column q, so kernelN[0..3] pair with input channels q..q+3.
    const float* kernel0 = kernel + p * inch + q;
    const float* kernel1 = kernel + (p + 1) * inch + q;
    const float* kernel2 = kernel + (p + 2) * inch + q;
    const float* kernel3 = kernel + (p + 3) * inch + q;

    // Read cursors over the four input channels.
    const float* r0 = img0;
    const float* r1 = img1;
    const float* r2 = img2;
    const float* r3 = img3;

    // Number of elements per channel plane.
    int size = outw * outh;

    // nn: count of full 8-element groups (size / 8).
    // remain: leftover elements (size % 8); not consumed in the lines shown here.
    int nn = size >> 3;
    int remain = size & 7;

    // Load four consecutive weights from each kernel row into a 128-bit vector.
    float32x4_t _k0 = vld1q_f32(kernel0);
    float32x4_t _k1 = vld1q_f32(kernel1);
    float32x4_t _k2 = vld1q_f32(kernel2);
    float32x4_t _k3 = vld1q_f32(kernel3);

    // ARMv7 NEON kernel: for each of nn groups of 8 floats, compute
    //   outptrC[0..7] += r0[0..7]*_kC[0] + r1[0..7]*_kC[1]
    //                  + r2[0..7]*_kC[2] + r3[0..7]*_kC[3]      (C = 0..3)
    // and advance every pointer by 8 floats.
    //
    // Operand modifiers on the "w" (NEON Q register) operands %18..%21:
    //   %eN  -> low  D register of the Q register, so %eN[0]/%eN[1] are lanes 0/1
    //   %fN  -> high D register of the Q register, so %fN[0]/%fN[1] are lanes 2/3
    //
    // Fix: the eight tied inputs "1"(outptr0) .. "8"(r3) are restored. Without
    // them the operand list ends at %13, so %18..%21 do not exist, and the
    // "=r" pointer outputs would enter the asm with undefined values.
    //
    // NOTE(review): the ":128" qualifiers require 16-byte aligned addresses, and
    // the loop preloads the next r0/outptr0 group before testing the counter, so
    // it reads 32 bytes past the last processed group - confirm the buffers allow it.
    if (nn > 0)
    {
        asm volatile(
            // Prologue: preload first r0 group (q6,q7) and first outptr0 group (q8,q9).
            "pld        [%5, #256]              \n"
            "vld1.f32   {d12-d15}, [%5 :128]!   \n"
            "pld        [%1, #256]              \n"
            "vld1.f32   {d16-d19}, [%1 :128]    \n"
            "0:                                 \n"

            // r0 data (q6,q7) times lane 0 of each kernel vector; the remaining
            // three output groups are loaded in between the multiply-accumulates.
            "vmla.f32   q8, q6, %e18[0]         \n"

            "pld        [%2, #256]              \n"
            "vld1.f32   {d20-d23}, [%2 :128]    \n"
            "vmla.f32   q9, q7, %e18[0]         \n"

            "vmla.f32   q10, q6, %e19[0]        \n"

            "pld        [%3, #256]              \n"
            "vld1.f32   {d24-d27}, [%3 :128]    \n"
            "vmla.f32   q11, q7, %e19[0]        \n"

            "vmla.f32   q12, q6, %e20[0]        \n"

            "pld        [%4, #256]              \n"
            "vld1.f32   {d28-d31}, [%4 :128]    \n"
            "vmla.f32   q13, q7, %e20[0]        \n"

            "pld        [%6, #256]              \n"
            "vld1.f32   {d8-d11}, [%6 :128]!    \n"

            "vmla.f32   q14, q6, %e21[0]        \n"
            "vmla.f32   q15, q7, %e21[0]        \n"

            // r1 data (q4,q5) times lane 1.
            "vmla.f32   q8, q4, %e18[1]         \n"
            "vmla.f32   q9, q5, %e18[1]         \n"

            "vmla.f32   q10, q4, %e19[1]        \n"
            "vmla.f32   q11, q5, %e19[1]        \n"

            "vmla.f32   q12, q4, %e20[1]        \n"
            "vmla.f32   q13, q5, %e20[1]        \n"

            "pld        [%7, #256]              \n"
            "vld1.f32   {d12-d15}, [%7 :128]!   \n"

            "vmla.f32   q14, q4, %e21[1]        \n"
            "vmla.f32   q15, q5, %e21[1]        \n"

            // r2 data (q6,q7) times lane 2 (high D register, index 0).
            "vmla.f32   q8, q6, %f18[0]         \n"
            "vmla.f32   q9, q7, %f18[0]         \n"

            "vmla.f32   q10, q6, %f19[0]        \n"
            "vmla.f32   q11, q7, %f19[0]        \n"

            "vmla.f32   q12, q6, %f20[0]        \n"
            "vmla.f32   q13, q7, %f20[0]        \n"

            "pld        [%8, #256]              \n"
            "vld1.f32   {d8-d11}, [%8 :128]!    \n"

            "vmla.f32   q14, q6, %f21[0]        \n"
            "vmla.f32   q15, q7, %f21[0]        \n"

            // r3 data (q4,q5) times lane 3 (high D register, index 1); results are
            // stored and the next iteration's r0/outptr0 groups are preloaded.
            "vmla.f32   q8, q4, %f18[1]         \n"
            "vmla.f32   q9, q5, %f18[1]         \n"

            "vmla.f32   q10, q4, %f19[1]        \n"
            "vmla.f32   q11, q5, %f19[1]        \n"

            "vmla.f32   q12, q4, %f20[1]        \n"
            "vst1.f32   {d16-d19}, [%1 :128]!   \n"

            "vmla.f32   q13, q5, %f20[1]        \n"

            "vst1.f32   {d20-d23}, [%2 :128]!   \n"

            "vmla.f32   q14, q4, %f21[1]        \n"
            "pld        [%5, #256]              \n"
            "vld1.f32   {d12-d15}, [%5 :128]!   \n"

            "vmla.f32   q15, q5, %f21[1]        \n"

            "vst1.f32   {d24-d27}, [%3 :128]!   \n"

            "pld        [%1, #256]              \n"
            "vld1.f32   {d16-d19}, [%1 :128]    \n"

            "subs       %0, #1                  \n"
            "vst1.f32   {d28-d31}, [%4 :128]!   \n"

            "bne        0b                      \n"
            // Undo the post-increment of the final, unused r0 preload (8 floats = 32 bytes).
            "sub        %5, #32                 \n"
            : "=r"(nn),      // %0
            "=r"(outptr0), // %1
            "=r"(outptr1), // %2
            "=r"(outptr2), // %3
            "=r"(outptr3), // %4
            "=r"(r0),      // %5
            "=r"(r1),      // %6
            "=r"(r2),      // %7
            "=r"(r3)       // %8
            : "0"(nn),       // %9  (tied to %0)
            "1"(outptr0),  // %10 (tied to %1)
            "2"(outptr1),  // %11 (tied to %2)
            "3"(outptr2),  // %12 (tied to %3)
            "4"(outptr3),  // %13 (tied to %4)
            "5"(r0),       // %14 (tied to %5)
            "6"(r1),       // %15 (tied to %6)
            "7"(r2),       // %16 (tied to %7)
            "8"(r3),       // %17 (tied to %8)
            "w"(_k0),      // %18
            "w"(_k1),      // %19
            "w"(_k2),      // %20
            "w"(_k3)       // %21
            : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
    }

// ARM64 codes

// Output pointers, one per output channel, starting at out0..out3.
float* outptr0 = out0;
float* outptr1 = out1;
float* outptr2 = out2;
float* outptr3 = out3;

// Four consecutive input channels q..q+3 of bottom_blob.
const float* img0 = bottom_blob.channel(q);
const float* img1 = bottom_blob.channel(q + 1);
const float* img2 = bottom_blob.channel(q + 2);
const float* img3 = bottom_blob.channel(q + 3);

// Weight rows for outputs p..p+3: each row is `inch` floats long and is
// entered at column q, so kernelN[0..3] pair with input channels q..q+3.
const float* kernel0 = kernel + p * inch + q;
const float* kernel1 = kernel + (p + 1) * inch + q;
const float* kernel2 = kernel + (p + 2) * inch + q;
const float* kernel3 = kernel + (p + 3) * inch + q;

// Read cursors over the four input channels.
const float* r0 = img0;
const float* r1 = img1;
const float* r2 = img2;
const float* r3 = img3;

// Number of elements per channel plane.
int size = outw * outh;

// nn: count of full 8-element groups (size / 8).
// remain: leftover elements (size % 8); not consumed in the lines shown here.
int nn = size >> 3;
int remain = size & 7;

// Load four consecutive weights from each kernel row into a 128-bit vector.
float32x4_t _k0 = vld1q_f32(kernel0);
float32x4_t _k1 = vld1q_f32(kernel1);
float32x4_t _k2 = vld1q_f32(kernel2);
float32x4_t _k3 = vld1q_f32(kernel3);

// AArch64 NEON kernel: for each of nn groups of 8 floats, compute
//   outptrC[0..7] += r0[0..7]*_kC[0] + r1[0..7]*_kC[1]
//                  + r2[0..7]*_kC[2] + r3[0..7]*_kC[3]      (C = 0..3)
// and advance every pointer by 8 floats.
//
// Operand notation:
//   %18.s[i] -> the "w" operand is printed as its vector register name (vN);
//               ".s[i]" is ordinary assembler text selecting 32-bit lane i.
//   %w0      -> the 32-bit (wN) view of the general register holding `nn`.
//
// Fix: the eight tied inputs "1"(outptr0) .. "8"(r3) are restored. Without
// them the operand list ends at %13, so %18..%21 do not exist, and the
// "=r" pointer outputs would enter the asm with undefined values.
//
// NOTE(review): the loop preloads the next r0/outptr0 group before testing the
// counter, so it reads 32 bytes past the last processed group - confirm the
// buffers allow it.
if (nn > 0)
{
    asm volatile(
        // Prologue: preload first r0 group (v6,v7) and first outptr0 group (v8,v9).
        "prfm   pldl1keep, [%5, #256]       \n"
        "ld1    {v6.4s, v7.4s}, [%5], #32   \n"

        "prfm   pldl1keep, [%1, #256]       \n"
        "ld1    {v8.4s, v9.4s}, [%1]        \n"

        "0:                                 \n"

        // r0 data (v6,v7) times lane 0 of each kernel vector; the remaining
        // three output groups are loaded in between the multiply-accumulates.
        "fmla   v8.4s, v6.4s, %18.s[0]      \n"

        "prfm   pldl1keep, [%2, #256]       \n"
        "ld1    {v10.4s, v11.4s}, [%2]      \n"

        "fmla   v9.4s, v7.4s, %18.s[0]      \n"

        "fmla   v10.4s, v6.4s, %19.s[0]     \n"

        "prfm   pldl1keep, [%3, #256]       \n"
        "ld1    {v12.4s, v13.4s}, [%3]      \n"

        "fmla   v11.4s, v7.4s, %19.s[0]     \n"

        "fmla   v12.4s, v6.4s, %20.s[0]     \n"

        "prfm   pldl1keep, [%4, #256]       \n"
        "ld1    {v14.4s, v15.4s}, [%4]      \n"

        "fmla   v13.4s, v7.4s, %20.s[0]     \n"

        "prfm   pldl1keep, [%6, #256]       \n"
        "ld1    {v4.4s, v5.4s}, [%6], #32   \n"

        "fmla   v14.4s, v6.4s, %21.s[0]     \n"
        "fmla   v15.4s, v7.4s, %21.s[0]     \n"

        // r1 data (v4,v5) times lane 1.
        "fmla   v8.4s, v4.4s, %18.s[1]      \n"
        "fmla   v9.4s, v5.4s, %18.s[1]      \n"

        "fmla   v10.4s, v4.4s, %19.s[1]     \n"
        "fmla   v11.4s, v5.4s, %19.s[1]     \n"

        "fmla   v12.4s, v4.4s, %20.s[1]     \n"
        "fmla   v13.4s, v5.4s, %20.s[1]     \n"

        "prfm   pldl1keep, [%7, #256]       \n"
        "ld1    {v6.4s, v7.4s}, [%7], #32   \n"

        "fmla   v14.4s, v4.4s, %21.s[1]     \n"
        "fmla   v15.4s, v5.4s, %21.s[1]     \n"

        // r2 data (v6,v7) times lane 2.
        "fmla   v8.4s, v6.4s, %18.s[2]      \n"
        "fmla   v9.4s, v7.4s, %18.s[2]      \n"

        "fmla   v10.4s, v6.4s, %19.s[2]     \n"
        "fmla   v11.4s, v7.4s, %19.s[2]     \n"

        "fmla   v12.4s, v6.4s, %20.s[2]     \n"
        "fmla   v13.4s, v7.4s, %20.s[2]     \n"

        "prfm   pldl1keep, [%8, #256]       \n"
        "ld1    {v4.4s, v5.4s}, [%8], #32   \n"

        "fmla   v14.4s, v6.4s, %21.s[2]     \n"
        "fmla   v15.4s, v7.4s, %21.s[2]     \n"

        // r3 data (v4,v5) times lane 3; results are stored and the next
        // iteration's r0/outptr0 groups are preloaded.
        "fmla   v8.4s, v4.4s, %18.s[3]      \n"
        "fmla   v9.4s, v5.4s, %18.s[3]      \n"

        "fmla   v10.4s, v4.4s, %19.s[3]     \n"
        "fmla   v11.4s, v5.4s, %19.s[3]     \n"

        "st1    {v8.4s, v9.4s}, [%1], #32   \n"

        "fmla   v12.4s, v4.4s, %20.s[3]     \n"
        "fmla   v13.4s, v5.4s, %20.s[3]     \n"

        "st1    {v10.4s, v11.4s}, [%2], #32 \n"

        "prfm   pldl1keep, [%5, #256]       \n"
        "ld1    {v6.4s, v7.4s}, [%5], #32   \n"

        "fmla   v14.4s, v4.4s, %21.s[3]     \n"
        "fmla   v15.4s, v5.4s, %21.s[3]     \n"

        "st1    {v12.4s, v13.4s}, [%3], #32 \n"

        "prfm   pldl1keep, [%1, #256]       \n"
        "ld1    {v8.4s, v9.4s}, [%1]        \n"

        "subs   %w0, %w0, #1                \n"

        "st1    {v14.4s, v15.4s}, [%4], #32 \n"

        "bne    0b                          \n"
        // Undo the post-increment of the final, unused r0 preload (8 floats = 32 bytes).
        "sub    %5, %5, #32                 \n"
        : "=r"(nn),      // %0
        "=r"(outptr0), // %1
        "=r"(outptr1), // %2
        "=r"(outptr2), // %3
        "=r"(outptr3), // %4
        "=r"(r0),      // %5
        "=r"(r1),      // %6
        "=r"(r2),      // %7
        "=r"(r3)       // %8
        : "0"(nn),       // %9  (tied to %0)
        "1"(outptr0),  // %10 (tied to %1)
        "2"(outptr1),  // %11 (tied to %2)
        "3"(outptr2),  // %12 (tied to %3)
        "4"(outptr3),  // %13 (tied to %4)
        "5"(r0),       // %14 (tied to %5)
        "6"(r1),       // %15 (tied to %6)
        "7"(r2),       // %16 (tied to %7)
        "8"(r3),       // %17 (tied to %8)
        "w"(_k0),      // %18
        "w"(_k1),      // %19
        "w"(_k2),      // %20
        "w"(_k3)       // %21
        : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
}


Recently, I have been reading the ARM inline assembly code of some open-source projects.
I came across code like the following:

// arm32
asm volatile(
    // ...
    "vmla.f32   q6, q12, %q16           \n"
    // ...
// arm64
asm volatile(
    // ...
    "fmla  v16.4s, v22.4s, %16.s[0]     \n"
    // ...

I can't find any documentation that explains the letters following the % in these operands.
In the arm32 code there are %e, %f and %q; in the arm64 code I only found "s".
So, what do these letters mean?

The compiler is clang.
The relatively complete code is shown below:

// ARM32 codes
    // Output pointers, one per output channel, starting at out0..out3.
    float* outptr0 = out0;
    float* outptr1 = out1;
    float* outptr2 = out2;
    float* outptr3 = out3;

    // Four consecutive input channels q..q+3 of bottom_blob.
    const float* img0 = bottom_blob.channel(q);
    const float* img1 = bottom_blob.channel(q + 1);
    const float* img2 = bottom_blob.channel(q + 2);
    const float* img3 = bottom_blob.channel(q + 3);

    // Weight rows for outputs p..p+3: each row is `inch` floats long and is
    // entered at column q, so kernelN[0..3] pair with input channels q..q+3.
    const float* kernel0 = kernel + p * inch + q;
    const float* kernel1 = kernel + (p + 1) * inch + q;
    const float* kernel2 = kernel + (p + 2) * inch + q;
    const float* kernel3 = kernel + (p + 3) * inch + q;

    // Read cursors over the four input channels.
    const float* r0 = img0;
    const float* r1 = img1;
    const float* r2 = img2;
    const float* r3 = img3;

    // Number of elements per channel plane.
    int size = outw * outh;

    // nn: count of full 8-element groups (size / 8).
    // remain: leftover elements (size % 8); not consumed in the lines shown here.
    int nn = size >> 3;
    int remain = size & 7;

    // Load four consecutive weights from each kernel row into a 128-bit vector.
    float32x4_t _k0 = vld1q_f32(kernel0);
    float32x4_t _k1 = vld1q_f32(kernel1);
    float32x4_t _k2 = vld1q_f32(kernel2);
    float32x4_t _k3 = vld1q_f32(kernel3);

    // ARMv7 NEON kernel: for each of nn groups of 8 floats, compute
    //   outptrC[0..7] += r0[0..7]*_kC[0] + r1[0..7]*_kC[1]
    //                  + r2[0..7]*_kC[2] + r3[0..7]*_kC[3]      (C = 0..3)
    // and advance every pointer by 8 floats.
    //
    // Operand modifiers on the "w" (NEON Q register) operands %18..%21:
    //   %eN  -> low  D register of the Q register, so %eN[0]/%eN[1] are lanes 0/1
    //   %fN  -> high D register of the Q register, so %fN[0]/%fN[1] are lanes 2/3
    //
    // Fix: the eight tied inputs "1"(outptr0) .. "8"(r3) are restored. Without
    // them the operand list ends at %13, so %18..%21 do not exist, and the
    // "=r" pointer outputs would enter the asm with undefined values.
    //
    // NOTE(review): the ":128" qualifiers require 16-byte aligned addresses, and
    // the loop preloads the next r0/outptr0 group before testing the counter, so
    // it reads 32 bytes past the last processed group - confirm the buffers allow it.
    if (nn > 0)
    {
        asm volatile(
            // Prologue: preload first r0 group (q6,q7) and first outptr0 group (q8,q9).
            "pld        [%5, #256]              \n"
            "vld1.f32   {d12-d15}, [%5 :128]!   \n"
            "pld        [%1, #256]              \n"
            "vld1.f32   {d16-d19}, [%1 :128]    \n"
            "0:                                 \n"

            // r0 data (q6,q7) times lane 0 of each kernel vector; the remaining
            // three output groups are loaded in between the multiply-accumulates.
            "vmla.f32   q8, q6, %e18[0]         \n"

            "pld        [%2, #256]              \n"
            "vld1.f32   {d20-d23}, [%2 :128]    \n"
            "vmla.f32   q9, q7, %e18[0]         \n"

            "vmla.f32   q10, q6, %e19[0]        \n"

            "pld        [%3, #256]              \n"
            "vld1.f32   {d24-d27}, [%3 :128]    \n"
            "vmla.f32   q11, q7, %e19[0]        \n"

            "vmla.f32   q12, q6, %e20[0]        \n"

            "pld        [%4, #256]              \n"
            "vld1.f32   {d28-d31}, [%4 :128]    \n"
            "vmla.f32   q13, q7, %e20[0]        \n"

            "pld        [%6, #256]              \n"
            "vld1.f32   {d8-d11}, [%6 :128]!    \n"

            "vmla.f32   q14, q6, %e21[0]        \n"
            "vmla.f32   q15, q7, %e21[0]        \n"

            // r1 data (q4,q5) times lane 1.
            "vmla.f32   q8, q4, %e18[1]         \n"
            "vmla.f32   q9, q5, %e18[1]         \n"

            "vmla.f32   q10, q4, %e19[1]        \n"
            "vmla.f32   q11, q5, %e19[1]        \n"

            "vmla.f32   q12, q4, %e20[1]        \n"
            "vmla.f32   q13, q5, %e20[1]        \n"

            "pld        [%7, #256]              \n"
            "vld1.f32   {d12-d15}, [%7 :128]!   \n"

            "vmla.f32   q14, q4, %e21[1]        \n"
            "vmla.f32   q15, q5, %e21[1]        \n"

            // r2 data (q6,q7) times lane 2 (high D register, index 0).
            "vmla.f32   q8, q6, %f18[0]         \n"
            "vmla.f32   q9, q7, %f18[0]         \n"

            "vmla.f32   q10, q6, %f19[0]        \n"
            "vmla.f32   q11, q7, %f19[0]        \n"

            "vmla.f32   q12, q6, %f20[0]        \n"
            "vmla.f32   q13, q7, %f20[0]        \n"

            "pld        [%8, #256]              \n"
            "vld1.f32   {d8-d11}, [%8 :128]!    \n"

            "vmla.f32   q14, q6, %f21[0]        \n"
            "vmla.f32   q15, q7, %f21[0]        \n"

            // r3 data (q4,q5) times lane 3 (high D register, index 1); results are
            // stored and the next iteration's r0/outptr0 groups are preloaded.
            "vmla.f32   q8, q4, %f18[1]         \n"
            "vmla.f32   q9, q5, %f18[1]         \n"

            "vmla.f32   q10, q4, %f19[1]        \n"
            "vmla.f32   q11, q5, %f19[1]        \n"

            "vmla.f32   q12, q4, %f20[1]        \n"
            "vst1.f32   {d16-d19}, [%1 :128]!   \n"

            "vmla.f32   q13, q5, %f20[1]        \n"

            "vst1.f32   {d20-d23}, [%2 :128]!   \n"

            "vmla.f32   q14, q4, %f21[1]        \n"
            "pld        [%5, #256]              \n"
            "vld1.f32   {d12-d15}, [%5 :128]!   \n"

            "vmla.f32   q15, q5, %f21[1]        \n"

            "vst1.f32   {d24-d27}, [%3 :128]!   \n"

            "pld        [%1, #256]              \n"
            "vld1.f32   {d16-d19}, [%1 :128]    \n"

            "subs       %0, #1                  \n"
            "vst1.f32   {d28-d31}, [%4 :128]!   \n"

            "bne        0b                      \n"
            // Undo the post-increment of the final, unused r0 preload (8 floats = 32 bytes).
            "sub        %5, #32                 \n"
            : "=r"(nn),      // %0
            "=r"(outptr0), // %1
            "=r"(outptr1), // %2
            "=r"(outptr2), // %3
            "=r"(outptr3), // %4
            "=r"(r0),      // %5
            "=r"(r1),      // %6
            "=r"(r2),      // %7
            "=r"(r3)       // %8
            : "0"(nn),       // %9  (tied to %0)
            "1"(outptr0),  // %10 (tied to %1)
            "2"(outptr1),  // %11 (tied to %2)
            "3"(outptr2),  // %12 (tied to %3)
            "4"(outptr3),  // %13 (tied to %4)
            "5"(r0),       // %14 (tied to %5)
            "6"(r1),       // %15 (tied to %6)
            "7"(r2),       // %16 (tied to %7)
            "8"(r3),       // %17 (tied to %8)
            "w"(_k0),      // %18
            "w"(_k1),      // %19
            "w"(_k2),      // %20
            "w"(_k3)       // %21
            : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
    }

// ARM64 codes

// Output pointers, one per output channel, starting at out0..out3.
float* outptr0 = out0;
float* outptr1 = out1;
float* outptr2 = out2;
float* outptr3 = out3;

// Four consecutive input channels q..q+3 of bottom_blob.
const float* img0 = bottom_blob.channel(q);
const float* img1 = bottom_blob.channel(q + 1);
const float* img2 = bottom_blob.channel(q + 2);
const float* img3 = bottom_blob.channel(q + 3);

// Weight rows for outputs p..p+3: each row is `inch` floats long and is
// entered at column q, so kernelN[0..3] pair with input channels q..q+3.
const float* kernel0 = kernel + p * inch + q;
const float* kernel1 = kernel + (p + 1) * inch + q;
const float* kernel2 = kernel + (p + 2) * inch + q;
const float* kernel3 = kernel + (p + 3) * inch + q;

// Read cursors over the four input channels.
const float* r0 = img0;
const float* r1 = img1;
const float* r2 = img2;
const float* r3 = img3;

// Number of elements per channel plane.
int size = outw * outh;

// nn: count of full 8-element groups (size / 8).
// remain: leftover elements (size % 8); not consumed in the lines shown here.
int nn = size >> 3;
int remain = size & 7;

// Load four consecutive weights from each kernel row into a 128-bit vector.
float32x4_t _k0 = vld1q_f32(kernel0);
float32x4_t _k1 = vld1q_f32(kernel1);
float32x4_t _k2 = vld1q_f32(kernel2);
float32x4_t _k3 = vld1q_f32(kernel3);

// AArch64 NEON kernel: for each of nn groups of 8 floats, compute
//   outptrC[0..7] += r0[0..7]*_kC[0] + r1[0..7]*_kC[1]
//                  + r2[0..7]*_kC[2] + r3[0..7]*_kC[3]      (C = 0..3)
// and advance every pointer by 8 floats.
//
// Operand notation:
//   %18.s[i] -> the "w" operand is printed as its vector register name (vN);
//               ".s[i]" is ordinary assembler text selecting 32-bit lane i.
//   %w0      -> the 32-bit (wN) view of the general register holding `nn`.
//
// Fix: the eight tied inputs "1"(outptr0) .. "8"(r3) are restored. Without
// them the operand list ends at %13, so %18..%21 do not exist, and the
// "=r" pointer outputs would enter the asm with undefined values.
//
// NOTE(review): the loop preloads the next r0/outptr0 group before testing the
// counter, so it reads 32 bytes past the last processed group - confirm the
// buffers allow it.
if (nn > 0)
{
    asm volatile(
        // Prologue: preload first r0 group (v6,v7) and first outptr0 group (v8,v9).
        "prfm   pldl1keep, [%5, #256]       \n"
        "ld1    {v6.4s, v7.4s}, [%5], #32   \n"

        "prfm   pldl1keep, [%1, #256]       \n"
        "ld1    {v8.4s, v9.4s}, [%1]        \n"

        "0:                                 \n"

        // r0 data (v6,v7) times lane 0 of each kernel vector; the remaining
        // three output groups are loaded in between the multiply-accumulates.
        "fmla   v8.4s, v6.4s, %18.s[0]      \n"

        "prfm   pldl1keep, [%2, #256]       \n"
        "ld1    {v10.4s, v11.4s}, [%2]      \n"

        "fmla   v9.4s, v7.4s, %18.s[0]      \n"

        "fmla   v10.4s, v6.4s, %19.s[0]     \n"

        "prfm   pldl1keep, [%3, #256]       \n"
        "ld1    {v12.4s, v13.4s}, [%3]      \n"

        "fmla   v11.4s, v7.4s, %19.s[0]     \n"

        "fmla   v12.4s, v6.4s, %20.s[0]     \n"

        "prfm   pldl1keep, [%4, #256]       \n"
        "ld1    {v14.4s, v15.4s}, [%4]      \n"

        "fmla   v13.4s, v7.4s, %20.s[0]     \n"

        "prfm   pldl1keep, [%6, #256]       \n"
        "ld1    {v4.4s, v5.4s}, [%6], #32   \n"

        "fmla   v14.4s, v6.4s, %21.s[0]     \n"
        "fmla   v15.4s, v7.4s, %21.s[0]     \n"

        // r1 data (v4,v5) times lane 1.
        "fmla   v8.4s, v4.4s, %18.s[1]      \n"
        "fmla   v9.4s, v5.4s, %18.s[1]      \n"

        "fmla   v10.4s, v4.4s, %19.s[1]     \n"
        "fmla   v11.4s, v5.4s, %19.s[1]     \n"

        "fmla   v12.4s, v4.4s, %20.s[1]     \n"
        "fmla   v13.4s, v5.4s, %20.s[1]     \n"

        "prfm   pldl1keep, [%7, #256]       \n"
        "ld1    {v6.4s, v7.4s}, [%7], #32   \n"

        "fmla   v14.4s, v4.4s, %21.s[1]     \n"
        "fmla   v15.4s, v5.4s, %21.s[1]     \n"

        // r2 data (v6,v7) times lane 2.
        "fmla   v8.4s, v6.4s, %18.s[2]      \n"
        "fmla   v9.4s, v7.4s, %18.s[2]      \n"

        "fmla   v10.4s, v6.4s, %19.s[2]     \n"
        "fmla   v11.4s, v7.4s, %19.s[2]     \n"

        "fmla   v12.4s, v6.4s, %20.s[2]     \n"
        "fmla   v13.4s, v7.4s, %20.s[2]     \n"

        "prfm   pldl1keep, [%8, #256]       \n"
        "ld1    {v4.4s, v5.4s}, [%8], #32   \n"

        "fmla   v14.4s, v6.4s, %21.s[2]     \n"
        "fmla   v15.4s, v7.4s, %21.s[2]     \n"

        // r3 data (v4,v5) times lane 3; results are stored and the next
        // iteration's r0/outptr0 groups are preloaded.
        "fmla   v8.4s, v4.4s, %18.s[3]      \n"
        "fmla   v9.4s, v5.4s, %18.s[3]      \n"

        "fmla   v10.4s, v4.4s, %19.s[3]     \n"
        "fmla   v11.4s, v5.4s, %19.s[3]     \n"

        "st1    {v8.4s, v9.4s}, [%1], #32   \n"

        "fmla   v12.4s, v4.4s, %20.s[3]     \n"
        "fmla   v13.4s, v5.4s, %20.s[3]     \n"

        "st1    {v10.4s, v11.4s}, [%2], #32 \n"

        "prfm   pldl1keep, [%5, #256]       \n"
        "ld1    {v6.4s, v7.4s}, [%5], #32   \n"

        "fmla   v14.4s, v4.4s, %21.s[3]     \n"
        "fmla   v15.4s, v5.4s, %21.s[3]     \n"

        "st1    {v12.4s, v13.4s}, [%3], #32 \n"

        "prfm   pldl1keep, [%1, #256]       \n"
        "ld1    {v8.4s, v9.4s}, [%1]        \n"

        "subs   %w0, %w0, #1                \n"

        "st1    {v14.4s, v15.4s}, [%4], #32 \n"

        "bne    0b                          \n"
        // Undo the post-increment of the final, unused r0 preload (8 floats = 32 bytes).
        "sub    %5, %5, #32                 \n"
        : "=r"(nn),      // %0
        "=r"(outptr0), // %1
        "=r"(outptr1), // %2
        "=r"(outptr2), // %3
        "=r"(outptr3), // %4
        "=r"(r0),      // %5
        "=r"(r1),      // %6
        "=r"(r2),      // %7
        "=r"(r3)       // %8
        : "0"(nn),       // %9  (tied to %0)
        "1"(outptr0),  // %10 (tied to %1)
        "2"(outptr1),  // %11 (tied to %2)
        "3"(outptr2),  // %12 (tied to %3)
        "4"(outptr3),  // %13 (tied to %4)
        "5"(r0),       // %14 (tied to %5)
        "6"(r1),       // %15 (tied to %6)
        "7"(r2),       // %16 (tied to %7)
        "8"(r3),       // %17 (tied to %8)
        "w"(_k0),      // %18
        "w"(_k1),      // %19
        "w"(_k2),      // %20
        "w"(_k3)       // %21
        : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
}

This code comes from the convolution functions of the NCNN project.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。



需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。