如何使用动态分支来跳过不必要的指令

发布于 2025-01-17 05:32:36 字数 1877 浏览 1 评论 0原文

我想使用动态分支来跳过不必要的指令。请考虑两个函数：

// Guard-clause variant: returns -1.0 when abs(s) > 1.0, otherwise falls
// through to the remaining work and returns acos(s).
float computeFirst(float s)
{
    // The [branch] attribute is attached to an if with no else; the rest of
    // the function body comes after the if rather than inside it.
    [branch] if(abs(s) > 1.0)
        return -1.0;
    
    // a bunch of instructions
    
    return acos(s); // acos just for example
}

// If/else variant: returns -1.0 when abs(s) > 1.0; all remaining work,
// including the acos(s) return, is placed inside the else block.
float computeSecond(float s)
{
    // The [branch] attribute is attached to an if that has an explicit else.
    [branch] if(abs(s) > 1.0)
    {
        return -1.0;
    }
    else    
    {
        // a bunch of instructions
        
        return acos(s); // acos just for example
    }
}

这两个函数等价吗？两者都使用了动态分支，但它们的工作方式是否相同？当同一个 warp（线程束）中的所有像素都走同一分支时，不必要的指令是否真的会被跳过？

使用Shader Playground，我发现这两个函数的编译方式不同：

// computeFirst
// In this listing the if/endif pair encloses a single mov. All remaining
// arithmetic sits outside the branch, and the final movc picks the result.
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 2
lt r0.x, l(1.000000), |v0.x|  // r0.x = (1.0 < |v0.x|)
if_nz r0.x
  mov r0.y, l(-1.000000)  // the only instruction inside the branch
endif 
// Everything from here to the movc is outside the if/endif.
add r0.z, -|v0.x|, l(1.000000)  // r0.z = 1.0 - |v0.x|
sqrt r0.z, r0.z
// Three chained multiply-adds in |v0.x| accumulate into r0.w.
mad r0.w, |v0.x|, l(-0.018729), l(0.074261)
mad r0.w, r0.w, |v0.x|, l(-0.212114)
mad r0.w, r0.w, |v0.x|, l(1.570729)
mul r1.x, r0.z, r0.w
mad r1.x, r1.x, l(-2.000000), l(3.141593)
lt r1.y, v0.x, -v0.x  // r1.y = (v0.x < -v0.x)
and r1.x, r1.y, r1.x  // r1.x is kept only where the r1.y mask is set
mad r0.z, r0.w, r0.z, r1.x
movc o0.x, r0.x, r0.y, r0.z  // o0.x = r0.x ? r0.y : r0.z
mov o0.yzw, l(0,0,0,0)
ret 
// Approximately 17 instruction slots used

// computeSecond
// In this listing the arithmetic is placed inside the else block, so it lies
// within the if/else/endif structure rather than after it.
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 2
lt r0.x, l(1.000000), |v0.x|  // r0.x = (1.0 < |v0.x|)
if_nz r0.x
  mov r0.x, l(-1.000000)  // taken side: result is the constant -1.0
else 
  add r0.y, -|v0.x|, l(1.000000)  // r0.y = 1.0 - |v0.x|
  sqrt r0.y, r0.y
  // Three chained multiply-adds in |v0.x| accumulate into r0.z.
  mad r0.z, |v0.x|, l(-0.018729), l(0.074261)
  mad r0.z, r0.z, |v0.x|, l(-0.212114)
  mad r0.z, r0.z, |v0.x|, l(1.570729)
  mul r0.w, r0.y, r0.z
  mad r0.w, r0.w, l(-2.000000), l(3.141593)
  lt r1.x, v0.x, -v0.x  // r1.x = (v0.x < -v0.x)
  and r0.w, r0.w, r1.x  // r0.w is kept only where the r1.x mask is set
  mad r0.x, r0.z, r0.y, r0.w
endif 
mov o0.x, r0.x  // both sides leave their result in r0.x
mov o0.yzw, l(0,0,0,0)
ret 
// Approximately 18 instruction slots used

在computeFirst中，动态分支看起来毫无用处，并且似乎永远不允许跳过不必要的指令。我是否误解了什么，这两个编译版本是否等效？

原文

I would like to use dynamic branching to skip unnecessary instructions. Please consider two functions:

// Guard-clause variant: returns -1.0 when abs(s) > 1.0, otherwise falls
// through to the remaining work and returns acos(s).
float computeFirst(float s)
{
    // The [branch] attribute is attached to an if with no else; the rest of
    // the function body comes after the if rather than inside it.
    [branch] if(abs(s) > 1.0)
        return -1.0;
    
    // a bunch of instructions
    
    return acos(s); // acos just for example
}

// If/else variant: returns -1.0 when abs(s) > 1.0; all remaining work,
// including the acos(s) return, is placed inside the else block.
float computeSecond(float s)
{
    // The [branch] attribute is attached to an if that has an explicit else.
    [branch] if(abs(s) > 1.0)
    {
        return -1.0;
    }
    else    
    {
        // a bunch of instructions
        
        return acos(s); // acos just for example
    }
}

Are these functions equivalent? Both have the dynamic branch, but do they work the same way, and are the unnecessary instructions actually skipped (when all the pixels in a warp follow the same branch)?

Using Shader Playground, I found that these two functions compile differently:

// computeFirst
// In this listing the if/endif pair encloses a single mov. All remaining
// arithmetic sits outside the branch, and the final movc picks the result.
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 2
lt r0.x, l(1.000000), |v0.x|  // r0.x = (1.0 < |v0.x|)
if_nz r0.x
  mov r0.y, l(-1.000000)  // the only instruction inside the branch
endif 
// Everything from here to the movc is outside the if/endif.
add r0.z, -|v0.x|, l(1.000000)  // r0.z = 1.0 - |v0.x|
sqrt r0.z, r0.z
// Three chained multiply-adds in |v0.x| accumulate into r0.w.
mad r0.w, |v0.x|, l(-0.018729), l(0.074261)
mad r0.w, r0.w, |v0.x|, l(-0.212114)
mad r0.w, r0.w, |v0.x|, l(1.570729)
mul r1.x, r0.z, r0.w
mad r1.x, r1.x, l(-2.000000), l(3.141593)
lt r1.y, v0.x, -v0.x  // r1.y = (v0.x < -v0.x)
and r1.x, r1.y, r1.x  // r1.x is kept only where the r1.y mask is set
mad r0.z, r0.w, r0.z, r1.x
movc o0.x, r0.x, r0.y, r0.z  // o0.x = r0.x ? r0.y : r0.z
mov o0.yzw, l(0,0,0,0)
ret 
// Approximately 17 instruction slots used

// computeSecond
// In this listing the arithmetic is placed inside the else block, so it lies
// within the if/else/endif structure rather than after it.
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 2
lt r0.x, l(1.000000), |v0.x|  // r0.x = (1.0 < |v0.x|)
if_nz r0.x
  mov r0.x, l(-1.000000)  // taken side: result is the constant -1.0
else 
  add r0.y, -|v0.x|, l(1.000000)  // r0.y = 1.0 - |v0.x|
  sqrt r0.y, r0.y
  // Three chained multiply-adds in |v0.x| accumulate into r0.z.
  mad r0.z, |v0.x|, l(-0.018729), l(0.074261)
  mad r0.z, r0.z, |v0.x|, l(-0.212114)
  mad r0.z, r0.z, |v0.x|, l(1.570729)
  mul r0.w, r0.y, r0.z
  mad r0.w, r0.w, l(-2.000000), l(3.141593)
  lt r1.x, v0.x, -v0.x  // r1.x = (v0.x < -v0.x)
  and r0.w, r0.w, r1.x  // r0.w is kept only where the r1.x mask is set
  mad r0.x, r0.z, r0.y, r0.w
endif 
mov o0.x, r0.x  // both sides leave their result in r0.x
mov o0.yzw, l(0,0,0,0)
ret 
// Approximately 18 instruction slots used

In computeFirst, the dynamic branch looks useless and never seems to allow unnecessary instructions to be skipped. Am I misunderstanding something, or are these two compiled versions actually equivalent?

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

欢烬 2025-01-24 05:32:36

看起来computeFirst函数在没有branch属性的情况下得到了更好的优化。我刚刚添加了实际表达式而不是注释行，现在编译结果取决于使用动态分支可以跳过的指令数：

// Same guard as the question's computeFirst, but with no [branch] attribute
// on the if, and with five chained acos(s) / acos(-1.0) steps as real work.
float computeFirst(float s)
{
    // Early exit with -1.0 when abs(s) > 1.0.
    if(abs(s) > 1.0)
        return -1.0;
    
    // Each step feeds the previous result back in as the next input.
    s = acos(s) / acos(-1.0);
    s = acos(s) / acos(-1.0);
    s = acos(s) / acos(-1.0);
    s = acos(s) / acos(-1.0);
    s = acos(s) / acos(-1.0); // if you comment out this line, dynamic branching will not be used
    
    return s;
}

// Listing for the five-step version: the test here is ge rather than lt, the
// arithmetic chain sits inside the if block, and the -1.0 constant is in the else.
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 1
ge r0.x, l(1.000000), |v0.x|  // r0.x = (1.0 >= |v0.x|)
if_nz r0.x
  // First group: same instruction pattern as the single-acos listings.
  add r0.x, -|v0.x|, l(1.000000)
  sqrt r0.x, r0.x
  mad r0.y, |v0.x|, l(-0.018729), l(0.074261)
  mad r0.y, r0.y, |v0.x|, l(-0.212114)
  mad r0.y, r0.y, |v0.x|, l(1.570729)
  mul r0.z, r0.x, r0.y
  mad r0.z, r0.z, l(-2.000000), l(3.141593)
  lt r0.w, v0.x, -v0.x  // r0.w = (v0.x < -v0.x)
  and r0.z, r0.w, r0.z
  mad r0.x, r0.y, r0.x, r0.z
  // The next seven-instruction group (mul, mad, sqrt, 3x mad, mul) repeats
  // four times; each repetition reads the r0.x produced by the previous one.
  mul r0.y, r0.x, l(0.318310)  // NOTE(review): 0.318310 is presumably the folded 1/acos(-1.0) - confirm
  mad r0.z, -r0.x, l(0.318310), l(1.000000)
  sqrt r0.z, r0.z
  mad r0.x, r0.x, l(-0.005962), l(0.074261)
  mad r0.x, r0.x, r0.y, l(-0.212114)
  mad r0.x, r0.x, r0.y, l(1.570729)
  mul r0.x, r0.z, r0.x
  mul r0.y, r0.x, l(0.318310)
  mad r0.z, -r0.x, l(0.318310), l(1.000000)
  sqrt r0.z, r0.z
  mad r0.x, r0.x, l(-0.005962), l(0.074261)
  mad r0.x, r0.x, r0.y, l(-0.212114)
  mad r0.x, r0.x, r0.y, l(1.570729)
  mul r0.x, r0.z, r0.x
  mul r0.y, r0.x, l(0.318310)
  mad r0.z, -r0.x, l(0.318310), l(1.000000)
  sqrt r0.z, r0.z
  mad r0.x, r0.x, l(-0.005962), l(0.074261)
  mad r0.x, r0.x, r0.y, l(-0.212114)
  mad r0.x, r0.x, r0.y, l(1.570729)
  mul r0.x, r0.z, r0.x
  mul r0.y, r0.x, l(0.318310)
  mad r0.z, -r0.x, l(0.318310), l(1.000000)
  sqrt r0.z, r0.z
  mad r0.x, r0.x, l(-0.005962), l(0.074261)
  mad r0.x, r0.x, r0.y, l(-0.212114)
  mad r0.x, r0.x, r0.y, l(1.570729)
  mul r0.x, r0.z, r0.x
  mul r0.x, r0.x, l(0.318310)  // final scale of the last step's result
else 
  mov r0.x, l(-1.000000)  // other side: result is the constant -1.0
endif 
mov o0.x, r0.x  // both sides leave their result in r0.x
mov o0.yzw, l(0,0,0,0)
ret 
// Approximately 47 instruction slots used

It seems that the computeFirst function is better optimized without the branch attribute. I just added the actual expressions instead of the comment line and now the compilation result depends on the number of instructions that can be skipped using dynamic branch:

// Same guard as the question's computeFirst, but with no [branch] attribute
// on the if, and with five chained acos(s) / acos(-1.0) steps as real work.
float computeFirst(float s)
{
    // Early exit with -1.0 when abs(s) > 1.0.
    if(abs(s) > 1.0)
        return -1.0;
    
    // Each step feeds the previous result back in as the next input.
    s = acos(s) / acos(-1.0);
    s = acos(s) / acos(-1.0);
    s = acos(s) / acos(-1.0);
    s = acos(s) / acos(-1.0);
    s = acos(s) / acos(-1.0); // if you comment out this line, dynamic branching will not be used
    
    return s;
}

// Listing for the five-step version: the test here is ge rather than lt, the
// arithmetic chain sits inside the if block, and the -1.0 constant is in the else.
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 1
ge r0.x, l(1.000000), |v0.x|  // r0.x = (1.0 >= |v0.x|)
if_nz r0.x
  // First group: same instruction pattern as the single-acos listings.
  add r0.x, -|v0.x|, l(1.000000)
  sqrt r0.x, r0.x
  mad r0.y, |v0.x|, l(-0.018729), l(0.074261)
  mad r0.y, r0.y, |v0.x|, l(-0.212114)
  mad r0.y, r0.y, |v0.x|, l(1.570729)
  mul r0.z, r0.x, r0.y
  mad r0.z, r0.z, l(-2.000000), l(3.141593)
  lt r0.w, v0.x, -v0.x  // r0.w = (v0.x < -v0.x)
  and r0.z, r0.w, r0.z
  mad r0.x, r0.y, r0.x, r0.z
  // The next seven-instruction group (mul, mad, sqrt, 3x mad, mul) repeats
  // four times; each repetition reads the r0.x produced by the previous one.
  mul r0.y, r0.x, l(0.318310)  // NOTE(review): 0.318310 is presumably the folded 1/acos(-1.0) - confirm
  mad r0.z, -r0.x, l(0.318310), l(1.000000)
  sqrt r0.z, r0.z
  mad r0.x, r0.x, l(-0.005962), l(0.074261)
  mad r0.x, r0.x, r0.y, l(-0.212114)
  mad r0.x, r0.x, r0.y, l(1.570729)
  mul r0.x, r0.z, r0.x
  mul r0.y, r0.x, l(0.318310)
  mad r0.z, -r0.x, l(0.318310), l(1.000000)
  sqrt r0.z, r0.z
  mad r0.x, r0.x, l(-0.005962), l(0.074261)
  mad r0.x, r0.x, r0.y, l(-0.212114)
  mad r0.x, r0.x, r0.y, l(1.570729)
  mul r0.x, r0.z, r0.x
  mul r0.y, r0.x, l(0.318310)
  mad r0.z, -r0.x, l(0.318310), l(1.000000)
  sqrt r0.z, r0.z
  mad r0.x, r0.x, l(-0.005962), l(0.074261)
  mad r0.x, r0.x, r0.y, l(-0.212114)
  mad r0.x, r0.x, r0.y, l(1.570729)
  mul r0.x, r0.z, r0.x
  mul r0.y, r0.x, l(0.318310)
  mad r0.z, -r0.x, l(0.318310), l(1.000000)
  sqrt r0.z, r0.z
  mad r0.x, r0.x, l(-0.005962), l(0.074261)
  mad r0.x, r0.x, r0.y, l(-0.212114)
  mad r0.x, r0.x, r0.y, l(1.570729)
  mul r0.x, r0.z, r0.x
  mul r0.x, r0.x, l(0.318310)  // final scale of the last step's result
else 
  mov r0.x, l(-1.000000)  // other side: result is the constant -1.0
endif 
mov o0.x, r0.x  // both sides leave their result in r0.x
mov o0.yzw, l(0,0,0,0)
ret 
// Approximately 47 instruction slots used

回复收藏 0 原文

~没有更多了~