PLD 在 Cortex-A9 上没有效果
我正在用下面的程序测试 PLD
对性能的影响。但是,在我编写的 C 代码中,使用和不使用 PLD
时的性能没有任何差别。我是否遗漏了什么,或者需要添加某些编译器选项?
/*
 * Global zero-initialized work buffers for the PLD benchmark.
 * Sizes below assume a 4-byte int (the arrays are passed to vld1q_s32,
 * which operates on 32-bit elements); the original notes said 15kb / 5kb.
 * arra, arrb and arrc are not referenced in the code shown.
 */
int arra[6144] = {0}; /* 6144 ints = 24 KiB */
int arrb[6144] = {0}; /* 6144 ints = 24 KiB */
int arrc[6144] = {0}; /* 6144 ints = 24 KiB */
int arrd[2048] = {0}; /* 2048 ints = 8 KiB */
int arre[2048] = {0}; /* 2048 ints = 8 KiB */
int arrf[2048] = {0}; /* 2048 ints = 8 KiB */
int arrg[2048] = {0}; /* 2048 ints = 8 KiB */
int arrh[2048] = {0}; /* 2048 ints = 8 KiB */
int arri[2048] = {0}; /* 2048 ints = 8 KiB */
int arrj[2048] = {0}; /* 2048 ints = 8 KiB */
int arrk[2048] = {0}; /* 2048 ints = 8 KiB */
int arrl[2048] = {0}; /* 2048 ints = 8 KiB */
/*
 * Benchmark driver from the question: issues five PLD hints once, then runs
 * 100 passes that refill nine 2048-element global arrays with 5 and perform
 * five NEON element-wise multiply loops over them.
 * NOTE(review): as posted, no header declaring int32x4_t / vld1q_s32 is
 * shown (presumably <arm_neon.h>), and main's closing brace is missing.
 */
int main()
{
int csize; /* never used below */
int i,z = 3; /* z is never used below */
int loop_i;
int32x4_t viarrd,viarre,viarrf;
int32x4_t viarrg,viarrh,viarri;
int32x4_t viarrj,viarrk,viarrl;
/*
 * Load the addresses of arrd..arrh into r1..r5.
 * NOTE(review): these asm statements declare no outputs or clobbers, so
 * nothing here guarantees r1-r5 still hold these addresses when the PLDs
 * below execute -- confirm against the generated assembly.
 */
asm("LDR r1, =arrd");
asm("LDR r2, =arre");
asm("LDR r3, =arrf");
asm("LDR r4, =arrg");
asm("LDR r5, =arrh");
/*
 * One PLD per array, issued once before any loop and pointing at the first
 * element only. As the answer below explains, a single PLD preloads a
 * single cache line, so the rest of each array is not covered.
 * arri..arrl receive no PLD at all.
 */
asm ("PLD [r1]");
asm ("PLD [r2]");
asm ("PLD [r3]");
asm ("PLD [r4]");
asm ("PLD [r5]");
/* 100 identical passes over the data. */
for(loop_i=0;loop_i<100;loop_i++)
{
/* Scalar fill: every element of arrd..arrl is set to 5 on each pass. */
for(i=0;i<2048;i++)
{
arrd[i] = 5;
arre[i] = 5;
arrf[i] = 5;
arrg[i] = 5;
arrh[i] = 5;
arri[i] = 5;
arrj[i] = 5;
arrk[i] = 5;
arrl[i] = 5;
}
/* arrd = arrf * arre, four 32-bit lanes per iteration. */
for(i=0;i<2048;i+=4)
{
viarrf = vld1q_s32(&arrf[i]);
viarre = vld1q_s32(&arre[i]);
viarrd = vmulq_s32(viarrf,viarre);
vst1q_s32(&arrd[i],viarrd);
}
/* arri = arrg * arrh */
for(i=0;i<2048;i+=4)
{
viarrg = vld1q_s32(&arrg[i]);
viarrh = vld1q_s32(&arrh[i]);
viarri = vmulq_s32(viarrg,viarrh);
vst1q_s32(&arri[i],viarri);
}
/* arrl = arrj * arrk */
for(i=0;i<2048;i+=4)
{
viarrj = vld1q_s32(&arrj[i]);
viarrk = vld1q_s32(&arrk[i]);
viarrl = vmulq_s32(viarrj,viarrk);
vst1q_s32(&arrl[i],viarrl);
}
/* arre = arrd * arrf (arrd holds the product stored by the first multiply loop) */
for(i=0;i<2048;i+=4)
{
viarrd = vld1q_s32(&arrd[i]);
viarrf = vld1q_s32(&arrf[i]);
viarre = vmulq_s32(viarrd,viarrf);
vst1q_s32(&arre[i],viarre);
}
/* arrh = arrg * arri (arri holds the product stored by the second multiply loop) */
for(i=0;i<2048;i+=4)
{
viarrg = vld1q_s32(&arrg[i]);
viarri = vld1q_s32(&arri[i]);
viarrh = vmulq_s32(viarrg,viarri);
vst1q_s32(&arrh[i],viarrh);
}
} /* end of loop_i; NOTE(review): no closing brace for main follows in the post */
I am using the following program to check the effect of PLD
on performance. However, I couldn't find any difference in performance with and without PLD
in the C code I've written. Is there anything I am missing, or any compiler option I need to add?
/*
 * Global zero-initialized work buffers for the PLD benchmark.
 * Sizes below assume a 4-byte int (the arrays are passed to vld1q_s32,
 * which operates on 32-bit elements); the original notes said 15kb / 5kb.
 * arra, arrb and arrc are not referenced in the code shown.
 */
int arra[6144] = {0}; /* 6144 ints = 24 KiB */
int arrb[6144] = {0}; /* 6144 ints = 24 KiB */
int arrc[6144] = {0}; /* 6144 ints = 24 KiB */
int arrd[2048] = {0}; /* 2048 ints = 8 KiB */
int arre[2048] = {0}; /* 2048 ints = 8 KiB */
int arrf[2048] = {0}; /* 2048 ints = 8 KiB */
int arrg[2048] = {0}; /* 2048 ints = 8 KiB */
int arrh[2048] = {0}; /* 2048 ints = 8 KiB */
int arri[2048] = {0}; /* 2048 ints = 8 KiB */
int arrj[2048] = {0}; /* 2048 ints = 8 KiB */
int arrk[2048] = {0}; /* 2048 ints = 8 KiB */
int arrl[2048] = {0}; /* 2048 ints = 8 KiB */
/*
 * Benchmark driver from the question: issues five PLD hints once, then runs
 * 100 passes that refill nine 2048-element global arrays with 5 and perform
 * five NEON element-wise multiply loops over them.
 * NOTE(review): as posted, no header declaring int32x4_t / vld1q_s32 is
 * shown (presumably <arm_neon.h>), and main's closing brace is missing.
 */
int main()
{
int csize; /* never used below */
int i,z = 3; /* z is never used below */
int loop_i;
int32x4_t viarrd,viarre,viarrf;
int32x4_t viarrg,viarrh,viarri;
int32x4_t viarrj,viarrk,viarrl;
/*
 * Load the addresses of arrd..arrh into r1..r5.
 * NOTE(review): these asm statements declare no outputs or clobbers, so
 * nothing here guarantees r1-r5 still hold these addresses when the PLDs
 * below execute -- confirm against the generated assembly.
 */
asm("LDR r1, =arrd");
asm("LDR r2, =arre");
asm("LDR r3, =arrf");
asm("LDR r4, =arrg");
asm("LDR r5, =arrh");
/*
 * One PLD per array, issued once before any loop and pointing at the first
 * element only. As the answer below explains, a single PLD preloads a
 * single cache line, so the rest of each array is not covered.
 * arri..arrl receive no PLD at all.
 */
asm ("PLD [r1]");
asm ("PLD [r2]");
asm ("PLD [r3]");
asm ("PLD [r4]");
asm ("PLD [r5]");
/* 100 identical passes over the data. */
for(loop_i=0;loop_i<100;loop_i++)
{
/* Scalar fill: every element of arrd..arrl is set to 5 on each pass. */
for(i=0;i<2048;i++)
{
arrd[i] = 5;
arre[i] = 5;
arrf[i] = 5;
arrg[i] = 5;
arrh[i] = 5;
arri[i] = 5;
arrj[i] = 5;
arrk[i] = 5;
arrl[i] = 5;
}
/* arrd = arrf * arre, four 32-bit lanes per iteration. */
for(i=0;i<2048;i+=4)
{
viarrf = vld1q_s32(&arrf[i]);
viarre = vld1q_s32(&arre[i]);
viarrd = vmulq_s32(viarrf,viarre);
vst1q_s32(&arrd[i],viarrd);
}
/* arri = arrg * arrh */
for(i=0;i<2048;i+=4)
{
viarrg = vld1q_s32(&arrg[i]);
viarrh = vld1q_s32(&arrh[i]);
viarri = vmulq_s32(viarrg,viarrh);
vst1q_s32(&arri[i],viarri);
}
/* arrl = arrj * arrk */
for(i=0;i<2048;i+=4)
{
viarrj = vld1q_s32(&arrj[i]);
viarrk = vld1q_s32(&arrk[i]);
viarrl = vmulq_s32(viarrj,viarrk);
vst1q_s32(&arrl[i],viarrl);
}
/* arre = arrd * arrf (arrd holds the product stored by the first multiply loop) */
for(i=0;i<2048;i+=4)
{
viarrd = vld1q_s32(&arrd[i]);
viarrf = vld1q_s32(&arrf[i]);
viarre = vmulq_s32(viarrd,viarrf);
vst1q_s32(&arre[i],viarre);
}
/* arrh = arrg * arri (arri holds the product stored by the second multiply loop) */
for(i=0;i<2048;i+=4)
{
viarrg = vld1q_s32(&arrg[i]);
viarri = vld1q_s32(&arri[i]);
viarrh = vmulq_s32(viarrg,viarri);
vst1q_s32(&arrh[i],viarrh);
}
} /* end of loop_i; NOTE(review): no closing brace for main follows in the post */
如果你对这篇内容有疑问,欢迎到本站社区发帖提问、参与讨论,获取更多帮助,或者扫描二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定您的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
您的描述中写的是 Cortex-A9,但标签写的是 Cortex-A8 - 到底是哪一个?在 Cortex-A8 上,pld 只会把数据加载到 L2,而您的数据集本来就能放进 L2,因此,如果数据已经在 L2 中,预加载不会带来任何好处。
话虽如此,无论您的代码是在 Cortex-A8 还是 A9 上运行,它都起不了多大作用,因为单条 pld 只会加载一个缓存行(32-64 字节);它并不会让 CPU 在此之后一直持续预取后续的缓存行。pld 指令的有效用法是在循环的每次迭代中发出它,使其指向当前加载位置之后若干个缓存行处的地址。理想情况下,您应该这样组织循环:每条 pld 恰好对应一个缓存行的加载量,以避免冗余的 pld。此外,您还应该将数据集按缓存行宽度对齐。
然而,Cortex-A9 有一个能够检测步幅的自动预取器。如果您使用的是 Cortex-A9 并且此功能已打开,pld 可能帮助不大甚至毫无帮助,反而只会白白占用流水线的时间。
Your description says Cortex-A9 but the tag says Cortex-A8 - which is it? On Cortex-A8 pld only loads to L2 and your data set already fits in L2, so if it's already there it won't benefit from preloading.
That said, your code wouldn't accomplish an awful lot regardless of whether it's on Cortex-A8 or A9, because a single pld will only load a single cache line (32-64 bytes); it won't tell the CPU to keep prefetching lines after that forever. An effective usage of the pld instruction is to issue it inside your loop iteration such that it's pointing multiple cache lines ahead of where you're currently loading from. Ideally you'd structure your loop such that one cache line's worth of loads are done per pld, in order to avoid redundant ones. Also, you'd align your data sets to cache line width.
However, Cortex-A9 has an automatic prefetcher that will detect strides. If you are on Cortex-A9 and this feature is turned on, the pld might not be helping much or at all, and will instead just waste time going through the pipeline.