是我的 MIPS 编译器疯了，还是选择 MIPS 的我疯了？

发布于 2024-12-08 05:36:41 字数 2066 浏览 17 评论 0原文

我在嵌入式项目中使用 MIPS CPU (PIC32)，但我开始质疑我的选择。我知道像 MIPS 这样的 RISC CPU 会生成比预期更多的指令，但我没想到会是这样。这是反汇编列表中的一个片段：

225:                         LATDSET = 0x0040;
    sw          s1,24808(s2)
    sw          s4,24808(s2)
    sw          s4,24808(s2)
    sw          s1,24808(s2)
    sw          s4,24808(s3)
    sw          s4,24808(s3)
    sw          s1,24808(s3)

226:                         {

227:                             porte = PORTE;
    lw          t1,24848(s4)
    andi        v0,t1,0xffff
    lw          v1,24848(s6)
    andi        ra,v1,0xffff
    lw          v1,24848(s6)
    andi        ra,v1,0xffff
    lw          v0,24848(s6)
    andi        t2,v0,0xffff
    lw          a2,24848(s5)
    andi        v1,a2,0xffff
    lw          t2,24848(s5)
    andi        v1,t2,0xffff
    lw          v0,24848(s5)
    andi        t2,v0,0xffff

228:                             if (porte & 0x0004)
    andi        t2,v0,0x4
    andi        s8,ra,0x4
    andi        s8,ra,0x4
    andi        ra,t2,0x4
    andi        a1,v1,0x4
    andi        a2,v1,0x4
    andi        a2,t2,0x4

229:                                 pst_bytes_somi[0] |= sliding_bit;
    or          t3,t4,s0
    xori        a3,t2,0x0
    movz        t3,s0,a3
    addu        s0,t3,zero
    or          t3,t4,s1
    xori        a3,s8,0x0
    movz        t3,s1,a3
    addu        s1,t3,zero
    or          t3,t4,s1
    xori        a3,s8,0x0
    movz        t3,s1,a3
    addu        s1,t3,zero
    or          v1,t4,s0
    xori        a3,ra,0x0
    movz        v1,s0,a3
    addu        s0,v1,zero
    or          a0,t4,s2
    xori        a3,a1,0x0
    movz        a0,s2,a3
    addu        s2,a0,zero
    or          t3,t4,s2
    xori        a3,a2,0x0
    movz        t3,s2,a3
    addu        s2,t3,zero
    or          v1,t4,s0
    xori        a3,a2,0x0
    movz        v1,s0,a3

这似乎是用于在固定地址进行简单读/写和测试变量的大量指令。在不同的 CPU 上，我可能可以将每个 C 语句减少到大约 1..3 条指令，而无需求助于手写汇编。显然，时钟速率相当高，但它并不比我在不同CPU（例如dsPIC）中的时钟速率高10倍。

我已将优化设置为最大。我的 C 编译器很糟糕吗（gcc 3.4.4）？或者这是 MIPS 的典型特征？

原文

I am using a MIPS CPU (PIC32) in an embedded project, but I am starting to question my choice.
I understand that a RISC CPU like MIPS will generate more instructions than one might expect, but I didn't think it would be like this. Here is a snippet from the disassembly listing:

225:                         LATDSET = 0x0040;
    sw          s1,24808(s2)
    sw          s4,24808(s2)
    sw          s4,24808(s2)
    sw          s1,24808(s2)
    sw          s4,24808(s3)
    sw          s4,24808(s3)
    sw          s1,24808(s3)

226:                         {

227:                             porte = PORTE;
    lw          t1,24848(s4)
    andi        v0,t1,0xffff
    lw          v1,24848(s6)
    andi        ra,v1,0xffff
    lw          v1,24848(s6)
    andi        ra,v1,0xffff
    lw          v0,24848(s6)
    andi        t2,v0,0xffff
    lw          a2,24848(s5)
    andi        v1,a2,0xffff
    lw          t2,24848(s5)
    andi        v1,t2,0xffff
    lw          v0,24848(s5)
    andi        t2,v0,0xffff

228:                             if (porte & 0x0004)
    andi        t2,v0,0x4
    andi        s8,ra,0x4
    andi        s8,ra,0x4
    andi        ra,t2,0x4
    andi        a1,v1,0x4
    andi        a2,v1,0x4
    andi        a2,t2,0x4

229:                                 pst_bytes_somi[0] |= sliding_bit;
    or          t3,t4,s0
    xori        a3,t2,0x0
    movz        t3,s0,a3
    addu        s0,t3,zero
    or          t3,t4,s1
    xori        a3,s8,0x0
    movz        t3,s1,a3
    addu        s1,t3,zero
    or          t3,t4,s1
    xori        a3,s8,0x0
    movz        t3,s1,a3
    addu        s1,t3,zero
    or          v1,t4,s0
    xori        a3,ra,0x0
    movz        v1,s0,a3
    addu        s0,v1,zero
    or          a0,t4,s2
    xori        a3,a1,0x0
    movz        a0,s2,a3
    addu        s2,a0,zero
    or          t3,t4,s2
    xori        a3,a2,0x0
    movz        t3,s2,a3
    addu        s2,t3,zero
    or          v1,t4,s0
    xori        a3,a2,0x0
    movz        v1,s0,a3

This seems like a crazy number of instructions for simple reading / writing and testing variables at fixed addresses. On a different CPU, I could probably get each C statement down to about 1..3 instructions, without resorting to hand-written asm. Obviously the clock rate is fairly high, but it's not 10x higher than what I would have in a different CPU (e.g. dsPIC).

I have optimisation set to maximum. Is my C compiler terrible (It's gcc 3.4.4)? Or is this typical of MIPS?

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

鹿港巷口少年归 2024-12-15 05:36:41

终于找到答案了。反汇编列表完全具有误导性。编译器正在进行循环展开，我们在每个 C 语句下看到的实际上是 8 倍的指令数，因为它把循环展开了 8 次。这些指令并不位于连续的地址！在编译器选项中关闭循环展开后会得到以下结果：

225:                         LATDSET = 0x0040;
    sw          s3,24808(s2)
226:                         {
227:                             porte = PORTE;
    lw          t1,24848(s5)
    andi        v0,t1,0xffff
228:                             if (porte & 0x0004)
    andi        t2,v0,0x4
229:                                 pst_bytes_somi[0] |= sliding_bit;
    or          t3,t4,s0
    xori        a3,t2,0x0
    movz        t3,s0,a3
    addu        s0,t3,zero
230:

虚惊一场，大家不用恐慌了。

Finally figured out the answer. The disassembly listing is totally misleading. The compiler is doing loop unrolling, and what we're seeing under each C statement is actually 8x the number of instructions, because it's unrolling the loop 8x. The instructions are not at consecutive addresses! Turning off loop unrolling in the compiler options produces this:

225:                         LATDSET = 0x0040;
    sw          s3,24808(s2)
226:                         {
227:                             porte = PORTE;
    lw          t1,24848(s5)
    andi        v0,t1,0xffff
228:                             if (porte & 0x0004)
    andi        t2,v0,0x4
229:                                 pst_bytes_somi[0] |= sliding_bit;
    or          t3,t4,s0
    xori        a3,t2,0x0
    movz        t3,s0,a3
    addu        s0,t3,zero
230:

Panic over everyone.

回复收藏 0 原文

儭儭莪哋寶赑 2024-12-15 05:36:41

我认为你的编译器行为不正常......
例如检查以下语句：

228:                             if (porte & 0x0004)
    andi        t2,v0,0x4  (1)
    andi        s8,ra,0x4  (2)
    andi        s8,ra,0x4  (3)
    andi        ra,t2,0x4  (4)
    andi        a1,v1,0x4  (5)
    andi        a2,v1,0x4  (6)
    andi        a2,t2,0x4  (7)

很明显，有些指令基本上什么也不做。指令（3）没有做任何新的事情，只是把指令（2）已经计算出的同一结果再次存入 s8。
指令（6）同样没有效果，因为它的结果会被下一条指令（7）覆盖。
我相信任何带有静态分析阶段的编译器至少都会删除指令（3）和（6）。

类似的分析也适用于代码的其他部分。例如，在第一个语句中，您可以看到一些寄存器（v0 和 v1）被两次加载了相同的值。

我认为你的编译器在优化编译代码方面做得不好。

I think your compiler is misbehaving...
Check for example this statement:

228:                             if (porte & 0x0004)
    andi        t2,v0,0x4  (1)
    andi        s8,ra,0x4  (2)
    andi        s8,ra,0x4  (3)
    andi        ra,t2,0x4  (4)
    andi        a1,v1,0x4  (5)
    andi        a2,v1,0x4  (6)
    andi        a2,t2,0x4  (7)

It is obvious that there are instructions that basically do nothing. Instruction (3) does nothing new: it stores in s8 the same result already computed by instruction (2).
Instruction (6) also has no effect, as its result is overwritten by the next instruction (7).
I believe any compiler which does some static analysis phase would at least remove instructions (3) and (6).

A similar analysis applies to other portions of your code. For example, in the first statement you can see that some registers (v0 and v1) are loaded with the same value twice.

I think your compiler is not doing a good job at optimizing the compiled code.

回复收藏 0 原文

猫九 2024-12-15 05:36:41

MIPS 基本上是 RISC 设计中所有愚蠢之处的体现。如今，x86（和 x86_64）已经吸收了 RISC 中几乎所有有价值的想法，并且 ARM 已经发展得比传统 RISC 更高效，同时仍然坚持保留小型系统指令集的 RISC 概念。

为了回答这个问题，我想说，你选择 MIPS 是疯狂的；或者更重要的是，你在没有先了解一点 MIPS ISA、了解它为什么如此糟糕、以及如果想用它需要忍受多少低效率的情况下，就选择了它。在大多数情况下，我会选择 ARM 来实现低功耗/嵌入式系统，或者如果您能承受更多的功耗，那么选择 Intel Atom 会更好。

编辑：实际上，说您可能疯了还有第二个原因……从评论来看，您似乎正在使用 16 位整数。在 C 中，您不应该使用小于 int 的类型，除非是在数组中，或者在会被大量分配的结构体中（无论是放在数组里，还是以链表、树等其他方式组织）。使用小类型除了节省空间之外不会带来任何好处（而除非您有大量这种类型的值，否则节省的空间无关紧要），并且几乎肯定比使用“普通”类型效率更低。就 MIPS 而言，这种差异尤为极端。请切换到 int，看看您的问题是否消失。

回复收藏 0 原文

日裸衫吸 2024-12-15 05:36:41

我唯一能想到的是，编译器可能会注入额外的无意义指令，以使 CPU 的速度与慢得多的数据总线速度相匹配。即使这样的解释也还不够充分，因为存储/加载指令同样具有冗余。

既然编译器值得怀疑，也别忘了：把注意力全部集中在编译器上，可能会让您陷入某种“隧道视野”。也许工具链的其他部分也潜藏着错误。

你从哪里得到编译器？我发现一些“简单”的来源经常会提供一些非常糟糕的工具。我的嵌入式开发朋友通常会编译自己的工具链，有时会得到更好的结果。

回复收藏 0 原文

_蜘蛛 2024-12-15 05:36:41

我尝试使用 CodeSourcery MIPS GCC 4.4-303 和 -O4 编译以下代码。我用 uint32_t 和 uint16_t 进行了尝试：

#include <stdint.h>
void foo(uint32_t PORTE, uint32_t pst_bytes_somi[], uint32_t sliding_bit) {
    uint32_t LATDSET = 0x0040;
    {
        uint32_t porte = PORTE;
        if (porte & 0x0004)
            pst_bytes_somi[0] |= sliding_bit;
        if (porte & LATDSET)
            pst_bytes_somi[1] |= sliding_bit;
    }
}

这是使用 uint32_t 整数的反汇编：

        uint32_t porte = PORTE;
        if (porte & 0x0004)
   0:   30820004    andi    v0,a0,0x4
   4:   10400004    beqz    v0,18 <foo+0x18>
   8:   00000000    nop
./foo32.c:7
            pst_bytes_somi[0] |= sliding_bit;
   c:   8ca20000    lw  v0,0(a1)
  10:   00461025    or  v0,v0,a2
  14:   aca20000    sw  v0,0(a1)
./foo32.c:8
        if (porte & LATDSET)
  18:   30840040    andi    a0,a0,0x40
  1c:   10800004    beqz    a0,30 <foo+0x30>
  20:   00000000    nop
./foo32.c:9
            pst_bytes_somi[1] |= sliding_bit;
  24:   8ca20004    lw  v0,4(a1)
  28:   00463025    or  a2,v0,a2
  2c:   aca60004    sw  a2,4(a1)
  30:   03e00008    jr  ra
  34:   00000000    nop

这是使用 uint16_t 整数的反汇编：

        if (porte & 0x0004)
   4:   30820004    andi    v0,a0,0x4
   8:   10400004    beqz    v0,1c <foo+0x1c>
   c:   30c6ffff    andi    a2,a2,0xffff
./foo16.c:7
            pst_bytes_somi[0] |= sliding_bit;
  10:   94a20000    lhu v0,0(a1)
  14:   00c21025    or  v0,a2,v0
  18:   a4a20000    sh  v0,0(a1)
./foo16.c:8
        if (porte & LATDSET)
  1c:   30840040    andi    a0,a0,0x40
  20:   10800004    beqz    a0,34 <foo+0x34>
  24:   00000000    nop
./foo16.c:9
            pst_bytes_somi[1] |= sliding_bit;
  28:   94a20002    lhu v0,2(a1)
  2c:   00c23025    or  a2,a2,v0
  30:   a4a60002    sh  a2,2(a1)
  34:   03e00008    jr  ra
  38:   00000000    nop

正如您所看到的，每个 C 语句都映射为两到三个指令。使用 16 位整数使函数仅长一条指令。

I tried compiling the following code with CodeSourcery MIPS GCC 4.4-303 with -O4. I tried it with uint32_t and uint16_t:

#include <stdint.h>
void foo(uint32_t PORTE, uint32_t pst_bytes_somi[], uint32_t sliding_bit) {
    uint32_t LATDSET = 0x0040;
    {
        uint32_t porte = PORTE;
        if (porte & 0x0004)
            pst_bytes_somi[0] |= sliding_bit;
        if (porte & LATDSET)
            pst_bytes_somi[1] |= sliding_bit;
    }
}

Here is the disassembly with uint32_t integers:

        uint32_t porte = PORTE;
        if (porte & 0x0004)
   0:   30820004    andi    v0,a0,0x4
   4:   10400004    beqz    v0,18 <foo+0x18>
   8:   00000000    nop
./foo32.c:7
            pst_bytes_somi[0] |= sliding_bit;
   c:   8ca20000    lw  v0,0(a1)
  10:   00461025    or  v0,v0,a2
  14:   aca20000    sw  v0,0(a1)
./foo32.c:8
        if (porte & LATDSET)
  18:   30840040    andi    a0,a0,0x40
  1c:   10800004    beqz    a0,30 <foo+0x30>
  20:   00000000    nop
./foo32.c:9
            pst_bytes_somi[1] |= sliding_bit;
  24:   8ca20004    lw  v0,4(a1)
  28:   00463025    or  a2,v0,a2
  2c:   aca60004    sw  a2,4(a1)
  30:   03e00008    jr  ra
  34:   00000000    nop

Here is the disassembly with uint16_t integers:

        if (porte & 0x0004)
   4:   30820004    andi    v0,a0,0x4
   8:   10400004    beqz    v0,1c <foo+0x1c>
   c:   30c6ffff    andi    a2,a2,0xffff
./foo16.c:7
            pst_bytes_somi[0] |= sliding_bit;
  10:   94a20000    lhu v0,0(a1)
  14:   00c21025    or  v0,a2,v0
  18:   a4a20000    sh  v0,0(a1)
./foo16.c:8
        if (porte & LATDSET)
  1c:   30840040    andi    a0,a0,0x40
  20:   10800004    beqz    a0,34 <foo+0x34>
  24:   00000000    nop
./foo16.c:9
            pst_bytes_somi[1] |= sliding_bit;
  28:   94a20002    lhu v0,2(a1)
  2c:   00c23025    or  a2,a2,v0
  30:   a4a60002    sh  a2,2(a1)
  34:   03e00008    jr  ra
  38:   00000000    nop

As you can see each C statement maps into two to three instructions. Using 16 bit integers makes the function only one instruction longer.

回复收藏 0 原文