如何使用动态分支来跳过不必要的指令
我想使用动态分支来跳过不必要的指令。请考虑两个函数:
// Returns -1.0 when |s| > 1.0; otherwise returns acos(s).
// Early-return form: the [branch]-attributed `if` has no `else`, and the
// remaining work follows it as fall-through code.
float computeFirst(float s)
{
// [branch] requests a real (dynamic) branch instead of a flattened select.
[branch] if(abs(s) > 1.0)
return -1.0;
// a bunch of instructions
return acos(s); // acos just for example
}
// Returns -1.0 when |s| > 1.0; otherwise returns acos(s).
// If/else form: each return sits in its own arm of the [branch]-attributed
// `if`, so the remaining work is lexically inside the `else` arm.
float computeSecond(float s)
{
// [branch] requests a real (dynamic) branch instead of a flattened select.
[branch] if(abs(s) > 1.0)
{
return -1.0;
}
else
{
// a bunch of instructions
return acos(s); // acos just for example
}
}
这两个函数等价吗?两者都有动态分支,但它们的工作方式是否相同?不必要的指令是否真的会被跳过(当一个 warp(线程束)中的所有像素都走同一分支时)?
使用Shader Playground,我发现这两个函数的编译方式不同:
// computeFirst
// Disassembly of the early-return variant. Only the `mov` of -1.0 sits
// inside if_nz/endif; the arithmetic after `endif` is outside the branch and
// runs on both paths, and the final `movc` picks the result.
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 2
// r0.x = mask: all bits set when 1.0 < |v0.x|, else 0 (v0.x is presumably s).
lt r0.x, l(1.000000), |v0.x|
if_nz r0.x
// r0.y = -1.0; written only when the mask is set.
mov r0.y, l(-1.000000)
endif
// From here on: executed regardless of the branch above.
// r0.z = sqrt(1 - |v0.x|)
add r0.z, -|v0.x|, l(1.000000)
sqrt r0.z, r0.z
// r0.w = cubic polynomial in |v0.x| (Horner form); together with the sqrt
// this is presumably the compiler's expansion of acos.
mad r0.w, |v0.x|, l(-0.018729), l(0.074261)
mad r0.w, r0.w, |v0.x|, l(-0.212114)
mad r0.w, r0.w, |v0.x|, l(1.570729)
// r1.x = 3.141593 - 2 * (sqrt * poly): correction term for negative input.
mul r1.x, r0.z, r0.w
mad r1.x, r1.x, l(-2.000000), l(3.141593)
// r1.y = mask for v0.x < -v0.x (i.e. v0.x negative); AND keeps the
// correction only in that case, otherwise r1.x becomes 0.
lt r1.y, v0.x, -v0.x
and r1.x, r1.y, r1.x
// r0.z = poly * sqrt + correction
mad r0.z, r0.w, r0.z, r1.x
// o0.x = r0.x ? r0.y : r0.z  (select -1.0 or the computed value by mask)
movc o0.x, r0.x, r0.y, r0.z
mov o0.yzw, l(0,0,0,0)
ret
// Approximately 17 instruction slots used
// computeSecond
// Disassembly of the if/else variant. The arithmetic sits inside the `else`
// arm, and both arms write the result to r0.x, which is copied out after
// `endif`.
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 2
// r0.x = mask: all bits set when 1.0 < |v0.x|, else 0 (v0.x is presumably s).
lt r0.x, l(1.000000), |v0.x|
if_nz r0.x
// Taken arm: result r0.x = -1.0.
mov r0.x, l(-1.000000)
else
// r0.y = sqrt(1 - |v0.x|)
add r0.y, -|v0.x|, l(1.000000)
sqrt r0.y, r0.y
// r0.z = cubic polynomial in |v0.x| (Horner form); together with the sqrt
// this is presumably the compiler's expansion of acos.
mad r0.z, |v0.x|, l(-0.018729), l(0.074261)
mad r0.z, r0.z, |v0.x|, l(-0.212114)
mad r0.z, r0.z, |v0.x|, l(1.570729)
// r0.w = 3.141593 - 2 * (sqrt * poly): correction term for negative input.
mul r0.w, r0.y, r0.z
mad r0.w, r0.w, l(-2.000000), l(3.141593)
// r1.x = mask for v0.x < -v0.x (i.e. v0.x negative); AND keeps the
// correction only in that case, otherwise r0.w becomes 0.
lt r1.x, v0.x, -v0.x
and r0.w, r0.w, r1.x
// Result r0.x = poly * sqrt + correction
mad r0.x, r0.z, r0.y, r0.w
endif
// o0.x = result from whichever arm ran.
mov o0.x, r0.x
mov o0.yzw, l(0,0,0,0)
ret
// Approximately 18 instruction slots used
在 computeFirst 中,动态分支看起来毫无用处,似乎永远无法跳过不必要的指令。是我误解了什么,还是这两个编译版本其实是等效的?
I would like to use dynamic branching to skip unnecessary instructions. Please consider two functions:
// Returns -1.0 when |s| > 1.0; otherwise returns acos(s).
// Early-return form: the [branch]-attributed `if` has no `else`, and the
// remaining work follows it as fall-through code.
float computeFirst(float s)
{
// [branch] requests a real (dynamic) branch instead of a flattened select.
[branch] if(abs(s) > 1.0)
return -1.0;
// a bunch of instructions
return acos(s); // acos just for example
}
// Returns -1.0 when |s| > 1.0; otherwise returns acos(s).
// If/else form: each return sits in its own arm of the [branch]-attributed
// `if`, so the remaining work is lexically inside the `else` arm.
float computeSecond(float s)
{
// [branch] requests a real (dynamic) branch instead of a flattened select.
[branch] if(abs(s) > 1.0)
{
return -1.0;
}
else
{
// a bunch of instructions
return acos(s); // acos just for example
}
}
Are these functions equivalent? Both have a dynamic branch, but do they work the same way, and are the unnecessary instructions actually skipped (when all the pixels in a warp follow the same branch)?
Using Shader Playground, I found that these two functions compile differently:
// computeFirst
// Disassembly of the early-return variant. Only the `mov` of -1.0 sits
// inside if_nz/endif; the arithmetic after `endif` is outside the branch and
// runs on both paths, and the final `movc` picks the result.
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 2
// r0.x = mask: all bits set when 1.0 < |v0.x|, else 0 (v0.x is presumably s).
lt r0.x, l(1.000000), |v0.x|
if_nz r0.x
// r0.y = -1.0; written only when the mask is set.
mov r0.y, l(-1.000000)
endif
// From here on: executed regardless of the branch above.
// r0.z = sqrt(1 - |v0.x|)
add r0.z, -|v0.x|, l(1.000000)
sqrt r0.z, r0.z
// r0.w = cubic polynomial in |v0.x| (Horner form); together with the sqrt
// this is presumably the compiler's expansion of acos.
mad r0.w, |v0.x|, l(-0.018729), l(0.074261)
mad r0.w, r0.w, |v0.x|, l(-0.212114)
mad r0.w, r0.w, |v0.x|, l(1.570729)
// r1.x = 3.141593 - 2 * (sqrt * poly): correction term for negative input.
mul r1.x, r0.z, r0.w
mad r1.x, r1.x, l(-2.000000), l(3.141593)
// r1.y = mask for v0.x < -v0.x (i.e. v0.x negative); AND keeps the
// correction only in that case, otherwise r1.x becomes 0.
lt r1.y, v0.x, -v0.x
and r1.x, r1.y, r1.x
// r0.z = poly * sqrt + correction
mad r0.z, r0.w, r0.z, r1.x
// o0.x = r0.x ? r0.y : r0.z  (select -1.0 or the computed value by mask)
movc o0.x, r0.x, r0.y, r0.z
mov o0.yzw, l(0,0,0,0)
ret
// Approximately 17 instruction slots used
// computeSecond
// Disassembly of the if/else variant. The arithmetic sits inside the `else`
// arm, and both arms write the result to r0.x, which is copied out after
// `endif`.
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 2
// r0.x = mask: all bits set when 1.0 < |v0.x|, else 0 (v0.x is presumably s).
lt r0.x, l(1.000000), |v0.x|
if_nz r0.x
// Taken arm: result r0.x = -1.0.
mov r0.x, l(-1.000000)
else
// r0.y = sqrt(1 - |v0.x|)
add r0.y, -|v0.x|, l(1.000000)
sqrt r0.y, r0.y
// r0.z = cubic polynomial in |v0.x| (Horner form); together with the sqrt
// this is presumably the compiler's expansion of acos.
mad r0.z, |v0.x|, l(-0.018729), l(0.074261)
mad r0.z, r0.z, |v0.x|, l(-0.212114)
mad r0.z, r0.z, |v0.x|, l(1.570729)
// r0.w = 3.141593 - 2 * (sqrt * poly): correction term for negative input.
mul r0.w, r0.y, r0.z
mad r0.w, r0.w, l(-2.000000), l(3.141593)
// r1.x = mask for v0.x < -v0.x (i.e. v0.x negative); AND keeps the
// correction only in that case, otherwise r0.w becomes 0.
lt r1.x, v0.x, -v0.x
and r0.w, r0.w, r1.x
// Result r0.x = poly * sqrt + correction
mad r0.x, r0.z, r0.y, r0.w
endif
// o0.x = result from whichever arm ran.
mov o0.x, r0.x
mov o0.yzw, l(0,0,0,0)
ret
// Approximately 18 instruction slots used
In computeFirst, the dynamic branch looks useless and never seems to allow unnecessary instructions to be skipped. Am I misunderstanding something, or are these two compiled versions equivalent?
如果你对这篇内容有疑问,欢迎到本站社区发帖提问、参与讨论,获取更多帮助,或者扫描二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定您的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
看起来computeFirst函数在没有branch属性的情况下得到了更好的优化。我刚刚添加了实际表达式而不是注释行,现在编译结果取决于使用动态分支可以跳过的指令数:
It seems that the computeFirst function is better optimized without the branch attribute. I just replaced the comment line with actual expressions, and now the compilation result depends on the number of instructions that can be skipped using the dynamic branch: