无法达到最佳性能

发布于 2024-12-24 02:06:05 字数 1381 浏览 0 评论 0原文

我试图通过下面的代码达到每个 SM 的峰值性能。峰值大约在 25 GFlops 左右（GTX275，GT200 架构），而这段代码最多只能达到 8 GFlops。

// Arithmetic micro-benchmark kernel: each thread runs LOOP dependent
// multiply-add steps on one value loaded from x and stores the result back.
//
// x: device buffer, read and written at this thread's global 1D index.
//    There is no bounds check, so x must hold at least
//    gridDim.x * blockDim.x floats.
// NOTE(review): LOOP is not defined in this snippet; the comment below gives
// the value the author used.
__global__ void new_ker(float *x)
{
  // Global 1D thread index.
  int index = threadIdx.x+blockIdx.x*blockDim.x;
  float a,b;
  a=0;
  b=x[index];
  //LOOP=10000000
  //No. of blocks = 1
  //Threads per block = 512 (I'm using GTX 275 - GT200 Arch.)
  // Ask the compiler to unroll the loop body 2048 times.
  #pragma unroll 2048
  for(int i=0;i<LOOP;i++){
       // One multiply and one add per iteration; every iteration depends on
       // the previous value of a, so a single thread has no independent work.
       a=a*b+b;
  }  

  // Store the result so the loop has an observable effect.
  x[index] = a;

 }

我不想在代码中增加 ILP。有人知道它为什么达不到峰值吗？

// Host driver: launches new_ker once, times it with CUDA events and prints a
// throughput figure.
// NOTE(review): BLOCKS, THPB and LOOP are not defined in this snippet.
int main(int argc,char **argv)
{

   //Initializations
   float *x;
   float *dx;
   cudaEvent_t new_start,new_stop;
   float elapsed;
   double gflops;
   x = 0;
   // NOTE(review): flag is not declared anywhere in this snippet.
   flag = 0;
   // Device buffer of THPB floats. The kernel indexes by global thread id, so
   // this size only covers a launch with a single block.
   cudaMalloc((void **)&dx,sizeof(float)*THPB);

   //ILP=1  
   cudaEventCreate(&new_start);
   cudaEventCreate(&new_stop);
   printf("Kernel1:\n");
   // Bracket the kernel launch with two events on stream 0.
   cudaEventRecord(new_start, 0);
   new_ker<<<BLOCKS,THPB>>>(dx);
   cudaEventRecord(new_stop,0);
   // Wait until the stop event (and therefore the kernel) has completed.
   cudaEventSynchronize(new_stop);
   // elapsed is reported in milliseconds.
   cudaEventElapsedTime(&elapsed,new_start,new_stop);
   x = (float *)malloc(sizeof(float)*THPB);
   cudaMemcpy(x,dx,sizeof(float)*THPB,cudaMemcpyDeviceToHost);

   // NOTE(review): this counts one operation per loop iteration per thread.
   // Each iteration performs a multiply and an add, so counting both as
   // floating-point operations would double the reported figure.
   gflops = ((double)(BLOCKS)*(THPB)*LOOP/elapsed)/1000000;
   printf("\t%f",gflops);
   cudaEventDestroy(new_start);
   cudaEventDestroy(new_stop);
   // NOTE(review): x and dx are never released, and no CUDA return code is checked.
   return 0;
}

平台： CUDA 3.0 NVIDIA GeForce GTX275 (GT200)

原文

I'm trying to reach the peak performance of each SM with the code below. The peak lies somewhere around 25 GFlops (GTX275, GT200 architecture). This code gives 8 GFlops at most.

// Arithmetic micro-benchmark kernel: each thread runs LOOP dependent
// multiply-add steps on one value loaded from x and stores the result back.
//
// x: device buffer, read and written at this thread's global 1D index.
//    There is no bounds check, so x must hold at least
//    gridDim.x * blockDim.x floats.
// NOTE(review): LOOP is not defined in this snippet; the comment below gives
// the value the author used.
__global__ void new_ker(float *x)
{
  // Global 1D thread index.
  int index = threadIdx.x+blockIdx.x*blockDim.x;
  float a,b;
  a=0;
  b=x[index];
  //LOOP=10000000
  //No. of blocks = 1
  //Threads per block = 512 (I'm using GTX 275 - GT200 Arch.)
  // Ask the compiler to unroll the loop body 2048 times.
  #pragma unroll 2048
  for(int i=0;i<LOOP;i++){
       // One multiply and one add per iteration; every iteration depends on
       // the previous value of a, so a single thread has no independent work.
       a=a*b+b;
  }  

  // Store the result so the loop has an observable effect.
  x[index] = a;

 }

I don't want to increase ILP in the code. Any ideas why it's not reaching peak??

// Host driver: launches new_ker once, times it with CUDA events and prints a
// throughput figure.
// NOTE(review): BLOCKS, THPB and LOOP are not defined in this snippet.
int main(int argc,char **argv)
{

   //Initializations
   float *x;
   float *dx;
   cudaEvent_t new_start,new_stop;
   float elapsed;
   double gflops;
   x = 0;
   // NOTE(review): flag is not declared anywhere in this snippet.
   flag = 0;
   // Device buffer of THPB floats. The kernel indexes by global thread id, so
   // this size only covers a launch with a single block.
   cudaMalloc((void **)&dx,sizeof(float)*THPB);

   //ILP=1  
   cudaEventCreate(&new_start);
   cudaEventCreate(&new_stop);
   printf("Kernel1:\n");
   // Bracket the kernel launch with two events on stream 0.
   cudaEventRecord(new_start, 0);
   new_ker<<<BLOCKS,THPB>>>(dx);
   cudaEventRecord(new_stop,0);
   // Wait until the stop event (and therefore the kernel) has completed.
   cudaEventSynchronize(new_stop);
   // elapsed is reported in milliseconds.
   cudaEventElapsedTime(&elapsed,new_start,new_stop);
   x = (float *)malloc(sizeof(float)*THPB);
   cudaMemcpy(x,dx,sizeof(float)*THPB,cudaMemcpyDeviceToHost);

   // NOTE(review): this counts one operation per loop iteration per thread.
   // Each iteration performs a multiply and an add, so counting both as
   // floating-point operations would double the reported figure.
   gflops = ((double)(BLOCKS)*(THPB)*LOOP/elapsed)/1000000;
   printf("\t%f",gflops);
   cudaEventDestroy(new_start);
   cudaEventDestroy(new_stop);
   // NOTE(review): x and dx are never released, and no CUDA return code is checked.
   return 0;
}

Platform:
CUDA 3.0
NVIDIA GeForce GTX275 (GT200)

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

醉南桥 2024-12-31 02:06:05

如果我根据您的代码整理出一个完整的可重现示例，并使用正确的 FLOP 计算方法：

#include <stdio.h> 

// Number of multiply-add iterations each thread of new_ker executes.
#define LOOP (10000000)
// Number of thread blocks in the launch; 30 gives one block for each of the
// 30 multiprocessors used in the peak estimate.
#define BLOCKS (30)
// Threads per block in the launch.
#define THPB (512)

// Arithmetic micro-benchmark kernel: each thread runs LOOP dependent
// multiply-add steps on one value loaded from x and stores the result back.
//
// x: device buffer, read and written at this thread's global 1D index.
//    There is no bounds check, so x must hold at least
//    gridDim.x * blockDim.x floats.
__global__ void new_ker(float *x)
{
  // Global 1D thread index.
  int index = threadIdx.x+blockIdx.x*blockDim.x;
  float a,b;
  a=0;
  b=x[index];
  // Ask the compiler to unroll the loop body 2048 times.
  #pragma unroll 2048
  for(int i=0;i<LOOP;i++){
       // One multiply and one add per iteration; every iteration depends on
       // the previous value of a, so a single thread has no independent work.
       a=a*b+b;
  }  

  // Store the result so the loop has an observable effect.
  x[index] = a;
}

// Reports a failed CUDA runtime call on stderr.
// Returns 1 when status is an error, 0 when it is cudaSuccess.
static int cudaFailed(cudaError_t status, const char *what)
{
   if (status == cudaSuccess) {
      return 0;
   }
   fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
   return 1;
}

// Host driver: launches new_ker on BLOCKS x THPB threads, times it with CUDA
// events and prints the achieved GFLOP/s (two FLOPs per loop iteration).
// Returns 0 on success and 1 if any allocation or CUDA call fails.
int main(int argc,char **argv)
{
   (void)argc;
   (void)argv;

   // One float per launched thread. The kernel indexes by global thread id,
   // so the device buffer must cover every block, not just one.
   const size_t count = (size_t)THPB * (size_t)BLOCKS;
   const size_t bytes = sizeof(float) * count;

   float *x = 0;
   float *dx = 0;
   cudaEvent_t new_start,new_stop;
   int haveStart = 0;
   int haveStop = 0;
   float elapsed = 0.0f;
   double gflops;
   int status = 1;

   // Single-pass block so that every failure path falls through to the
   // cleanup code below.
   do {
      if (cudaFailed(cudaMalloc((void **)&dx,bytes), "cudaMalloc")) break;
      // Give the kernel defined input instead of uninitialized device memory.
      if (cudaFailed(cudaMemset(dx,0,bytes), "cudaMemset")) break;

      x = (float *)malloc(bytes);
      if (x == 0) {
         fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)bytes);
         break;
      }

      //ILP=1
      if (cudaFailed(cudaEventCreate(&new_start), "cudaEventCreate(start)")) break;
      haveStart = 1;
      if (cudaFailed(cudaEventCreate(&new_stop), "cudaEventCreate(stop)")) break;
      haveStop = 1;

      printf("Kernel1:\n");
      // Bracket the launch with two events on stream 0.
      if (cudaFailed(cudaEventRecord(new_start, 0), "cudaEventRecord(start)")) break;
      new_ker<<<BLOCKS,THPB>>>(dx);
      // A launch does not return a status; a bad configuration shows up here.
      if (cudaFailed(cudaGetLastError(), "new_ker launch")) break;
      if (cudaFailed(cudaEventRecord(new_stop,0), "cudaEventRecord(stop)")) break;
      // Waiting on the stop event also surfaces errors raised while the kernel ran.
      if (cudaFailed(cudaEventSynchronize(new_stop), "cudaEventSynchronize")) break;
      if (cudaFailed(cudaEventElapsedTime(&elapsed,new_start,new_stop), "cudaEventElapsedTime")) break;

      if (cudaFailed(cudaMemcpy(x,dx,bytes,cudaMemcpyDeviceToHost), "cudaMemcpy")) break;

      // elapsed is in milliseconds. Each iteration is one multiply plus one
      // add, i.e. 2 FLOPs: 2 * LOOP * threads / (elapsed * 1e-3 s) / 1e9.
      gflops = 2.0e-6 * ((double)(LOOP)*(double)count/(double)elapsed);
      printf("\t%f\n",gflops);
      status = 0;
   } while (0);

   // Release whatever was acquired; free and cudaFree both accept null pointers.
   if (haveStop) cudaEventDestroy(new_stop);
   if (haveStart) cudaEventDestroy(new_start);
   free(x);
   cudaFree(dx);
   return status;
}

然后我在 64 位 Linux 平台上用 CUDA 3.2 编译它，并在 1.4GHz 的 GTX275 上运行：

$ nvcc -arch=sm_13 -Xptxas="-v" -o perf perf.cu
ptxas info    : Compiling entry function '_Z7new_kerPf' for 'sm_13'
ptxas info    : Used 4 registers, 8+16 bytes smem, 8 bytes cmem[1]
$ ./perf 
Kernel1:
        671.806039

对于运行纯 FMAD 代码的这块显卡，其峰值为 (1.4 GHz * 2 FLOP * 8 核/MP * 30 MP) = 672 GFLOP/s，我得到的结果与该峰值相差约 0.03% 以内。

因此，看起来代码实际上确实达到了每个多处理器一个块的峰值 FLOP/s，但您只是没有正确计算 FLOP/s 数。

If I put together a complete repro case from your code, using the correct FLOP calculation:

#include <stdio.h> 

// Number of multiply-add iterations each thread of new_ker executes.
#define LOOP (10000000)
// Number of thread blocks in the launch; 30 gives one block for each of the
// 30 multiprocessors used in the peak estimate.
#define BLOCKS (30)
// Threads per block in the launch.
#define THPB (512)

// Arithmetic micro-benchmark kernel: each thread runs LOOP dependent
// multiply-add steps on one value loaded from x and stores the result back.
//
// x: device buffer, read and written at this thread's global 1D index.
//    There is no bounds check, so x must hold at least
//    gridDim.x * blockDim.x floats.
__global__ void new_ker(float *x)
{
  // Global 1D thread index.
  int index = threadIdx.x+blockIdx.x*blockDim.x;
  float a,b;
  a=0;
  b=x[index];
  // Ask the compiler to unroll the loop body 2048 times.
  #pragma unroll 2048
  for(int i=0;i<LOOP;i++){
       // One multiply and one add per iteration; every iteration depends on
       // the previous value of a, so a single thread has no independent work.
       a=a*b+b;
  }  

  // Store the result so the loop has an observable effect.
  x[index] = a;
}

// Reports a failed CUDA runtime call on stderr.
// Returns 1 when status is an error, 0 when it is cudaSuccess.
static int cudaFailed(cudaError_t status, const char *what)
{
   if (status == cudaSuccess) {
      return 0;
   }
   fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
   return 1;
}

// Host driver: launches new_ker on BLOCKS x THPB threads, times it with CUDA
// events and prints the achieved GFLOP/s (two FLOPs per loop iteration).
// Returns 0 on success and 1 if any allocation or CUDA call fails.
int main(int argc,char **argv)
{
   (void)argc;
   (void)argv;

   // One float per launched thread. The kernel indexes by global thread id,
   // so the device buffer must cover every block, not just one.
   const size_t count = (size_t)THPB * (size_t)BLOCKS;
   const size_t bytes = sizeof(float) * count;

   float *x = 0;
   float *dx = 0;
   cudaEvent_t new_start,new_stop;
   int haveStart = 0;
   int haveStop = 0;
   float elapsed = 0.0f;
   double gflops;
   int status = 1;

   // Single-pass block so that every failure path falls through to the
   // cleanup code below.
   do {
      if (cudaFailed(cudaMalloc((void **)&dx,bytes), "cudaMalloc")) break;
      // Give the kernel defined input instead of uninitialized device memory.
      if (cudaFailed(cudaMemset(dx,0,bytes), "cudaMemset")) break;

      x = (float *)malloc(bytes);
      if (x == 0) {
         fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)bytes);
         break;
      }

      //ILP=1
      if (cudaFailed(cudaEventCreate(&new_start), "cudaEventCreate(start)")) break;
      haveStart = 1;
      if (cudaFailed(cudaEventCreate(&new_stop), "cudaEventCreate(stop)")) break;
      haveStop = 1;

      printf("Kernel1:\n");
      // Bracket the launch with two events on stream 0.
      if (cudaFailed(cudaEventRecord(new_start, 0), "cudaEventRecord(start)")) break;
      new_ker<<<BLOCKS,THPB>>>(dx);
      // A launch does not return a status; a bad configuration shows up here.
      if (cudaFailed(cudaGetLastError(), "new_ker launch")) break;
      if (cudaFailed(cudaEventRecord(new_stop,0), "cudaEventRecord(stop)")) break;
      // Waiting on the stop event also surfaces errors raised while the kernel ran.
      if (cudaFailed(cudaEventSynchronize(new_stop), "cudaEventSynchronize")) break;
      if (cudaFailed(cudaEventElapsedTime(&elapsed,new_start,new_stop), "cudaEventElapsedTime")) break;

      if (cudaFailed(cudaMemcpy(x,dx,bytes,cudaMemcpyDeviceToHost), "cudaMemcpy")) break;

      // elapsed is in milliseconds. Each iteration is one multiply plus one
      // add, i.e. 2 FLOPs: 2 * LOOP * threads / (elapsed * 1e-3 s) / 1e9.
      gflops = 2.0e-6 * ((double)(LOOP)*(double)count/(double)elapsed);
      printf("\t%f\n",gflops);
      status = 0;
   } while (0);

   // Release whatever was acquired; free and cudaFree both accept null pointers.
   if (haveStop) cudaEventDestroy(new_stop);
   if (haveStart) cudaEventDestroy(new_start);
   free(x);
   cudaFree(dx);
   return status;
}

And I compile it and run it on a 1.4GHz GTX275 with CUDA 3.2 on a 64 bit linux platform:

$ nvcc -arch=sm_13 -Xptxas="-v" -o perf perf.cu
ptxas info    : Compiling entry function '_Z7new_kerPf' for 'sm_13'
ptxas info    : Used 4 registers, 8+16 bytes smem, 8 bytes cmem[1]
$ ./perf 
Kernel1:
        671.806039

I get within about 0.03% of peak FLOP/s for that card running a pure FMAD code (1.4 GHz * 2 FLOP * 8 cores/MP * 30 MP) = 672 GFLOP/s.

So it seems that the code does, in fact, hit peak FLOP/s with one block per multiprocessor, but you just are not calculating the FLOP/s number correctly.

回复收藏 0 原文

~没有更多了~