无法在单个内核代码中找到平均值和方差

发布于 2024-10-01 06:45:21 字数 15000 浏览 1 评论 0原文

亲爱的学者们,
我无法在单个内核调用中实现mean和var。

目标:我需要找到矩阵中子矩阵的均值和方差。所以我写了以下内核

函数:


    global void kernelMean(d_a, d_mean, ...);
    global void kernelVar(d_a, d_var, ...);
    global void kernelMeanVar(d_a, d_mean, d_var,...);

问题: a)如果我单独计算 kernelMean 和 kernelVar ,它工作得很好 b) 如果我想在单个内核中计算 kernelMeanVar 是行不通的。

以下是我的代码,请让我知道可能的任何错误。

预先感谢您。

问候,
纳加拉朱


global void kernelMean(float* device_array, float* device_mean, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads)
{
      int block_id = blockIdx.x + gridDim.x * blockIdx.y;
      int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x;
      int my_rowIdx = thread_id/globalCols ;
      int my_colIdx = thread_id%globalCols ;
      int i,j;
      float temp;
      float sum = 0;
      float sumsq = 0.0;
      float mean;
      float ltotal_elts = (float) (localRows*localCols);

  device_mean[thread_id] = 0;
  if(thread_id <total_num_threads)
  {
    for(i=0; i < localRows ; i++)
    {
      for(j=0 ; j < localCols; j++)
      {
        temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
        sumsq = sumsq + (temp*temp);
        sum = sum + temp;
      }
    }
    mean = sum/ltotal_elts;
    device_mean[thread_id] = mean;

} } 全局 void kernelVar(float* device_array,float* device_var, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads) { int block_id = blockIdx.x + gridDim.x * blockIdx.y; int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x; int my_rowIdx = thread_id/globalCols ; int my_colIdx = thread_id%globalCols ; 整数 i,j; 浮动温度; 浮点总和=0; 浮点总和=0; 浮点数平均值 = 0; 浮动变量 = 0; 浮动 ltotal_elts = (浮动) localRows*localCols;

  device_var[thread_id] = 0;
  if(thread_id < total_num_threads)
  {
    for(i=0; i < localRows ; i++)
    {
      for(j=0 ; j < localCols; j++)
      {
        temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
        sum = sum + temp;
        sumsq = sumsq + (temp*temp);
      }
    }
    mean = sum/ltotal_elts;
    device_var[thread_id]  = (sumsq/ltotal_elts) - (mean*mean);
  }
}

全局 void kernelMeanVar(float* device_array, float* device_mean,float* device_var, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads) { int block_id = blockIdx.x + gridDim.x * blockIdx.y; int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x; int my_rowIdx = thread_id/globalCols ; int my_colIdx = thread_id%globalCols ; 整数 i,j; 浮动温度; 浮点总和=0; 浮点数总和 = 0.0; 浮动平均值; 浮动 ltotal_elts = (浮动) (localRows*localCols); device_mean[线程id] = 0; device_var[线程id] = 0; if(thread_id < 总线程数) { for(i=0; i < localRows ; i++) { for(j=0 ; j < localCols; j++) { temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)]; sumsq = sumsq + (temp*temp); 总和=总和+温度; } } 平均值=总和/ltotal_elts; device_mean[thread_id] = 平均值; device_var[thread_id] = (sumsq/ltotal_elts) - (平均值*平均值); } }

启用后的内核调用函数


void convertToFloat(float** float_ary, double* double_ary, int num_elts)
{
  for(int i = 0; i < num_elts ; i++)
  {
    (*float_ary)[i] = (float) double_ary[i];
     //printf("float_ary[%d] : %f \n", i, (*float_ary)[i]);
  }
  return;
}
void convertToDouble(double** double_ary, float* float_ary, int num_elts)
{
  for(int i = 0; i < num_elts ; i++)
  {
    (*double_ary)[i] = (double) float_ary[i];
  }
  return;
}
void computeMeanAndVarArray(double* host_array, int num_elts, int globalRows, int globalCols, int localRows, int localCols, int numRowElts, double** mean_ary, double** var_ary)
{
  float* host_array_float;
  float* device_array;
  float* host_mean;
  float* host_var;
  float* device_mean;
  float* device_var;
  double total_bytes =0;
  host_array_float = (float*) malloc (num_elts*sizeof(float));
  convertToFloat(&host_array_float, host_array, num_elts);
  //printf("num_elts %d \n", num_elts);
  cudaMalloc((void**) &device_array, sizeof(float)* num_elts);
  cudaMemset(device_array, 0, sizeof(float)* num_elts);
  cudaMemcpy(device_array, host_array_float,sizeof(float)* num_elts, cudaMemcpyHostToDevice);
  int numBlockThreads = MAX_THREADS_PER_BLOCK;
  int num_blocks = 0;
  int remain_elts = 0;
  int total_num_threads = globalRows * globalCols;
  cudaMalloc((void**) &device_mean, sizeof(float)* total_num_threads);
  cudaMemset(device_mean, 0, sizeof(float)* total_num_threads);
  cudaMalloc((void**) &device_var, sizeof(float)* total_num_threads);
  cudaMemset(device_var, 0, sizeof(float)* total_num_threads);
  num_blocks  =  total_num_threads/numBlockThreads;
  remain_elts =  total_num_threads%numBlockThreads;
  if(remain_elts > 0)
  {
    num_blocks++;
  }
  dim3 gridDim(num_blocks,1);
  dim3 blockDim(numBlockThreads,1);
  //kernelMean<<< gridDim,blockDim >>>(device_array, device_mean,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
  //kernelVar<<< gridDim,blockDim >>>(device_array, device_var,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
  kernelMeanVar<<< gridDim,blockDim >>>(device_array, device_mean, device_var,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
  host_mean = (float*) malloc( sizeof(float) * total_num_threads);
  memset(host_mean, 0, sizeof(float) * total_num_threads);
  host_var = (float*) malloc( sizeof(float) * total_num_threads);
  memset(host_var, 0, sizeof(float) * total_num_threads);
  cudaThreadSynchronize();
  cudaError_t error = cudaGetLastError();
   //if(error!=cudaSuccess) {
      printf("ERROR: %s\n", cudaGetErrorString(error) );
   //}
  cudaMemcpy(host_mean, device_mean, sizeof(float)*total_num_threads, cudaMemcpyDeviceToHost);
  convertToDouble(mean_ary, host_mean, total_num_threads);
  cudaMemcpy(host_var, device_var, sizeof(float)*total_num_threads,     cudaMemcpyDeviceToHost);
  for(int i = 0 ; i < 300 ; i++)
    printf("host_var[%d] %f \n",i, host_var[i]);
  convertToDouble(var_ary, host_var, total_num_threads);
  cudaFree(device_array);
  cudaFree(device_mean);
  cudaFree(device_var);
  free(host_mean);
  free(host_var);
  free(host_array_float);
}

结果


 global void kernelMean(d_a, d_mean, ...);
 global void kernelVar(d_a, d_var, ...);

错误:没有错误 主机变量[0] 4.497070 主机变量[1] 5.061768 主机变量[2] 5.687500 主机变量[3] 6.347534 主机变量[4] 6.829102 主机变量[5] 12.940308 主机变量[6] 14.309937 主机变量[7] 15.141113 主机变量[8] 18.741577 主机变量[9] 21.323608 主机变量[10] 21.727417 主机变量[11] 192.348389 主机变量[12] 579.911621 主机变量[13] 800.821045 主机变量[14] 1071.960938 主机变量[15] 1077.261719 主机变量[16] 993.262207 主机变量[17] 924.379883 主机变量[18] 839.437012 主机变量[19] 810.847656 主机变量[20] 835.007813 主机变量[21] 1124.365723 主机变量[22] 1241.685547 主机变量[23] 1376.504150 主机变量[24] 1196.745850 主机变量[25] 1097.473877 主机变量[26] 1008.840088 主机变量[27] 867.585083 主机变量[28] 794.241699 主机变量[29] 1322.409790 主机变量[30] 1556.029785 主机变量[31] 1564.997803 主机变量[32] 1870.985840 主机变量[33] 1929.829590 主机变量[34] 1822.189453 主机变量[35] 1662.321777 主机变量[36] 1372.886719 主机变量[37] 1074.727539 主机变量[38] 833.003906 主机变量[39] 632.514648 主机变量[40] 380.227539 主机变量[41] 87.345703 主机变量[42] 82.544922 主机变量[43] 78.756836 主机变量[44] 68.541016 主机变量[45] 61.981445 主机变量[46] 60.413086 主机变量[47] 60.128906 主机变量[48] 59.767578 主机变量[49] 59.223633 主机变量[50] 56.569336 主机变量[51] 53.866211 主机变量[52] 51.186523 主机变量[53] 55.270508 主机变量[54] 59.956055 主机变量[55] 66.516602 主机变量[56] 70.348633 主机变量[57] 71.706055 主机变量[58] 70.494141 主机变量[59] 69.897461 主机变量[60] 66.286133 主机变量[61] 67.926758 主机变量[62] 160.753906 主机变量[63] 447.221191 主机变量[64] 831.740723 主机变量[65] 1076.513672 主机变量[66] 1193.666992 主机变量[67] 1208.239746 主机变量[68] 1126.845947 主机变量[69] 948.397461 主机变量[70] 669.399414 主机变量[71] 340.465576 主机变量[72] 67.161865 主机变量[73] 7.421082 主机变量[74] 5.485626 主机变量[75] 5.135620 主机变量[76] 3.460419 主机变量[77] 3.853577 主机变量[78] 5.221100 主机变量[79] 5.890381 主机变量[80] 7.139618 主机变量[81] 7.517609 主机变量[82] 6.865875 主机变量[83] 5.053909 主机变量[84] 2.781616 主机变量[85] 2.021912 主机变量[86] 2.130417 主机变量[87] 3.113586 主机变量[88] 4.024399 主机变量[89] 4.582413 主机变量[90] 4.077118 主机变量[91] 3.024384 主机变量[92] 2.287506 主机变量[93] 1.793579 主机变量[94] 1.567474 主机变量[95] 1.829895 主机变量[96] 2.325928 主机变量[97] 3.429993 主机变量[98] 3.885559 主机变量[99] 3.835602 主机变量[100] 5.566406 主机变量[101] 8.065582 主机变量[102] 18.767456 主机变量[103] 35.395599 主机变量[104] 64.148407 主机变量[105] 125.937866 主机变量[106] 176.445618 主机变量[107] 216.073059 主机变量[108] 272.109985 主机变量[109] 307.972412 主机变量[110] 289.652344 主机变量[111] 238.253662 主机变量[112] 178.304932 主机变量[113] 116.925049 主机变量[114] 74.773926 主机变量[115] 61.227295 主机变量[116] 55.238525 主机变量[117] 55.387451 主机变量[118] 49.241699 主机变量[119] 38.396240 主机变量[120] 28.304932 主机变量[121] 20.225342 主机变量[122] 18.043457 主机变量[123] 21.418457 主机变量[124] 26.120117 主机变量[125] 25.899414 主机变量[126] 26.641602 主机变量[127] 23.747437 主机变量[128] 18.927368 主机变量[129] 21.664307 主机变量[130] 142.432373 主机变量[131] 1575.141602 主机变量[132] 2901.855957 主机变量[133] 4195.149902 主机变量[134] 5047.758789 主机变量[135] 5450.164063 主机变量[136] 5249.767578 主机变量[137] 4577.365234 主机变量[138] 3352.496094 主机变量[139] 1641.593750 主机变量[140] 352.242188 主机变量[141] 224.824219 主机变量[142] 194.578125 主机变量[143] 178.875000 主机变量[144] 175.148438 主机变量[145] 174.117188 主机变量[146] 172.707031 主机变量[147] 169.578125 主机变量[148] 176.308594 主机变量[149] 181.968750 主机变量[150] 191.507813 主机变量[151] 198.500000 主机变量[152] 206.824219 主机变量[153] 213.273438 主机变量[154] 220.312500 主机变量[155] 218.859375 主机变量[156] 213.941406 主机变量[157] 205.474609 主机变量[158] 190.722656 主机变量[159] 178.414063 主机变量[160] 169.302734 主机变量[161] 3.750366 主机变量[162] 4.333252 主机变量[163] 4.901855 主机变量[164] 5.527466 主机变量[165] 6.201782 主机变量[166] 11.921631 主机变量[167] 14.135376 主机变量[168] 14.885864 主机变量[169] 19.083618 主机变量[170] 21.290283 主机变量[171] 21.415649 主机变量[172] 209.747559 主机变量[173] 580.304932 主机变量[174] 800.949951 主机变量[175] 1119.857422 主机变量[176] 1129.382324 主机变量[177] 1032.616211 主机变量[178] 972.797363 主机变量[179] 915.440918 主机变量[180] 905.890137 主机变量[181] 943.649902 主机变量[182] 1207.445801 主机变量[183]​​ 1345.912109 主机变量[184] 1478.704590 主机变量[185] 1224.895508 主机变量[186] 1105.403564 主机变量[187] 1031.981201 主机变量[188] 914.456421 主机变量[189] 835.127441 主机变量[190] 1320.454102 主机变量[191] 1561.439941 主机变量[192] 1599.149902 主机变量[193] 1912.232910 主机变量[194] 1993.473145 主机变量[195] 1913.377441 主机变量[196] 1784.035645 主机变量[197] 1554.712891 主机变量[198] 1244.698242 host_var[199] 926.668945

启用结果


global void kernelMeanVar(d_a, d_mean, d_var,...);

错误:没有错误 主机变量[0] 0.000000 主机变量[1] 0.000000 主机变量[2] 0.000000 主机变量[3] 0.000000 主机变量[4] 0.000000 主机变量[5] 0.000000 主机变量[6] 0.000000 主机变量[7] 0.000000 主机变量[8] 0.000000 主机变量[9] 0.000000 主机变量[10] 0.000000 主机变量[11] 0.000000 主机变量[12] 0.000000 主机变量[13] 0.000000 主机变量[14] 0.000000 主机变量[15] 0.000000 主机变量[16] 0.000000 主机变量[17] 0.000000 主机变量[18] 0.000000 主机变量[19] 0.000000 主机变量[20] 0.000000 主机变量[21] 0.000000 主机变量[22] 0.000000 主机变量[23] 0.000000 主机变量[24] 0.000000 主机变量[25] 0.000000 主机变量[26] 0.000000 主机变量[27] 0.000000 主机变量[28] 0.000000 主机变量[29] 0.000000 主机变量[30] 0.000000 主机变量[31] 0.000000 主机变量[32] 0.000000 主机变量[33] 0.000000 主机变量[34] 0.000000 主机变量[35] 0.000000 主机变量[36] 0.000000 主机变量[37] 0.000000 主机变量[38] 0.000000 主机变量[39] 0.000000 主机变量[40] 0.000000 主机变量[41] 0.000000 主机变量[42] 0.000000 主机变量[43] 0.000000 主机变量[44] 0.000000 主机变量[45] 0.000000 主机变量[46] 0.000000 主机变量[47] 0.000000 主机变量[48] 0.000000 主机变量[49] 0.000000 主机变量[50] 0.000000 主机变量[51] 0.000000 主机变量[52] 0.000000 主机变量[53] 0.000000 主机变量[54] 0.000000 主机变量[55] 0.000000 主机变量[56] 0.000000 主机变量[57] 0.000000 主机变量[58] 0.000000 主机变量[59] 0.000000 主机变量[60] 0.000000 主机变量[61] 0.000000 主机变量[62] 0.000000 主机变量[63] 0.000000 主机变量[64] 0.000000 主机变量[65] 0.000000 主机变量[66] 0.000000 主机变量[67] 0.000000 主机变量[68] 0.000000 主机变量[69] 0.000000 主机变量[70] 0.000000 主机变量[71] 0.000000 主机变量[72] 0.000000 主机变量[73] 0.000000 主机变量[74] 0.000000 主机变量[75] 0.000000 主机变量[76] 0.000000 主机变量[77] 0.000000 主机变量[78] 0.000000 主机变量[79] 0.000000 主机变量[80] 0.000000 主机变量[81] 0.000000 主机变量[82] 0.000000 主机变量[83] 0.000000 主机变量[84] 0.000000 主机变量[85] 0.000000 主机变量[86] 0.000000 主机变量[87] 0.000000 主机变量[88] 0.000000 主机变量[89] 0.000000 主机变量[90] 0.000000 主机变量[91] 0.000000 主机变量[92] 0.000000 主机变量[93] 0.000000 主机变量[94] 0.000000 主机变量[95] 0.000000 主机变量[96] 0.000000 主机变量[97] 0.000000 主机变量[98] 0.000000 主机变量[99] 0.000000 主机变量[100] 0.000000 主机变量[101] 0.000000 主机变量[102] 0.000000 主机变量[103] 0.000000 主机变量[104] 0.000000 主机变量[105] 0.000000 主机变量[106] 0.000000 主机变量[107] 0.000000 主机变量[108] 0.000000 主机变量[109] 0.000000 主机变量[110] 0.000000 主机变量[111] 0.000000 主机变量[112] 0.000000 主机变量[113] 0.000000 主机变量[114] 0.000000 主机变量[115] 0.000000 主机变量[116] 0.000000 主机变量[117] 0.000000 主机变量[118] 0.000000 主机变量[119] 0.000000 主机变量[120] 0.000000 主机变量[121] 0.000000 主机变量[122] 0.000000 主机变量[123] 0.000000 主机变量[124] 0.000000 主机变量[125] 0.000000 主机变量[126] 0.000000 主机变量[127] 0.000000 主机变量[128] 18.927368 主机变量[129] 21.664307 主机变量[130] 142.432373 主机变量[131] 1575.141602 主机变量[132] 2901.855957 主机变量[133] 4195.149902 主机变量[134] 5047.758789 主机变量[135] 5450.164063 主机变量[136] 5249.767578 主机变量[137] 4577.365234 主机变量[138] 3352.496094 主机变量[139] 1641.593750 主机变量[140] 352.242188 主机变量[141] 224.824219 主机变量[142] 194.578125 主机变量[143] 178.875000 主机变量[144] 175.148438 主机变量[145] 174.117188 主机变量[146] 172.707031 主机变量[147] 169.578125 主机变量[148] 176.308594 主机变量[149] 181.968750 主机变量[150] 191.507813 主机变量[151] 198.500000 主机变量[152] 206.824219 主机变量[153] 213.273438 主机变量[154] 220.312500 主机变量[155] 218.859375 主机变量[156] 213.941406 主机变量[157] 205.474609 主机变量[158] 190.722656 主机变量[159] 178.414063 主机变量[160] 169.302734 主机变量[161] 3.750366 主机变量[162] 4.333252 主机变量[163] 4.901855 主机变量[164] 5.527466 主机变量[165] 6.201782 主机变量[166] 11.921631 主机变量[167] 14.135376 主机变量[168] 14.885864 主机变量[169] 19.083618 主机变量[170] 21.290283 主机变量[171] 21.415649 主机变量[172] 209.747559 主机变量[173] 580.304932 主机变量[174] 800.949951 主机变量[175] 1119.857422 主机变量[176] 1129.382324 主机变量[177] 1032.616211 主机变量[178] 972.797363 主机变量[179] 915.440918 主机变量[180] 905.890137 主机变量[181] 943.649902 主机变量[182] 1207.445801 主机变量[183]​​ 1345.912109 主机变量[184] 1478.704590 主机变量[185] 1224.895508 主机变量[186] 1105.403564 主机变量[187] 1031.981201 主机变量[188] 914.456421 主机变量[189] 835.127441 主机变量[190] 1320.454102 主机变量[191] 1561.439941 主机变量[192] 1599.149902 主机变量[193] 1912.232910 主机变量[194] 1993.473145 主机变量[195] 1913.377441 主机变量[196] 1784.035645 主机变量[197] 1554.712891 主机变量[198] 1244.698242 host_var[199] 926.668945

结果结束

Dear Scholars,
I am unable to implement mean and var in single kernel call.

Goal: I need to find mean and var of sub matrixs in a matrix. so I wrote following kernels

functions:


    global void kernelMean(d_a, d_mean, ...);
    global void kernelVar(d_a, d_var, ...);
    global void kernelMeanVar(d_a, d_mean, d_var,...);

Issue:
a) if I compute the kernelMean and kernelVar individually it works fine
b) if I want to compute kernelMeanVar in single kernel is does not work.

Below is my code, kindly let me know on possible any errors.

Thanking you in advance.

Regards,
Nagaraju


global void kernelMean(float* device_array, float* device_mean, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads)
{
      int block_id = blockIdx.x + gridDim.x * blockIdx.y;
      int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x;
      int my_rowIdx = thread_id/globalCols ;
      int my_colIdx = thread_id%globalCols ;
      int i,j;
      float temp;
      float sum = 0;
      float sumsq = 0.0;
      float mean;
      float ltotal_elts = (float) (localRows*localCols);

  device_mean[thread_id] = 0;
  if(thread_id <total_num_threads)
  {
    for(i=0; i < localRows ; i++)
    {
      for(j=0 ; j < localCols; j++)
      {
        temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
        sumsq = sumsq + (temp*temp);
        sum = sum + temp;
      }
    }
    mean = sum/ltotal_elts;
    device_mean[thread_id] = mean;

}
}
global void kernelVar(float* device_array,float* device_var, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads)
{
int block_id = blockIdx.x + gridDim.x * blockIdx.y;
int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x;
int my_rowIdx = thread_id/globalCols ;
int my_colIdx = thread_id%globalCols ;
int i,j;
float temp;
float sum = 0;
float sumsq = 0;
float mean = 0;
float var = 0;
float ltotal_elts = (float) localRows*localCols;

  device_var[thread_id] = 0;
  if(thread_id < total_num_threads)
  {
    for(i=0; i < localRows ; i++)
    {
      for(j=0 ; j < localCols; j++)
      {
        temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
        sum = sum + temp;
        sumsq = sumsq + (temp*temp);
      }
    }
    mean = sum/ltotal_elts;
    device_var[thread_id]  = (sumsq/ltotal_elts) - (mean*mean);
  }
}

global void kernelMeanVar(float* device_array, float* device_mean,float* device_var, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads)
{
int block_id = blockIdx.x + gridDim.x * blockIdx.y;
int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x;
int my_rowIdx = thread_id/globalCols ;
int my_colIdx = thread_id%globalCols ;
int i,j;
float temp;
float sum = 0;
float sumsq = 0.0;
float mean;
float ltotal_elts = (float) (localRows*localCols);
device_mean[thread_id] = 0;
device_var[thread_id] = 0;
if(thread_id < total_num_threads)
{
for(i=0; i < localRows ; i++)
{
for(j=0 ; j < localCols; j++)
{
temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
sumsq = sumsq + (temp*temp);
sum = sum + temp;
}
}
mean = sum/ltotal_elts;
device_mean[thread_id] = mean;
device_var[thread_id] = (sumsq/ltotal_elts) - (mean*mean);
}
}

Kernel Call Functions


void convertToFloat(float** float_ary, double* double_ary, int num_elts)
{
  for(int i = 0; i < num_elts ; i++)
  {
    (*float_ary)[i] = (float) double_ary[i];
     //printf("float_ary[%d] : %f \n", i, (*float_ary)[i]);
  }
  return;
}
void convertToDouble(double** double_ary, float* float_ary, int num_elts)
{
  for(int i = 0; i < num_elts ; i++)
  {
    (*double_ary)[i] = (double) float_ary[i];
  }
  return;
}
void computeMeanAndVarArray(double* host_array, int num_elts, int globalRows, int globalCols, int localRows, int localCols, int numRowElts, double** mean_ary, double** var_ary)
{
  float* host_array_float;
  float* device_array;
  float* host_mean;
  float* host_var;
  float* device_mean;
  float* device_var;
  double total_bytes =0;
  host_array_float = (float*) malloc (num_elts*sizeof(float));
  convertToFloat(&host_array_float, host_array, num_elts);
  //printf("num_elts %d \n", num_elts);
  cudaMalloc((void**) &device_array, sizeof(float)* num_elts);
  cudaMemset(device_array, 0, sizeof(float)* num_elts);
  cudaMemcpy(device_array, host_array_float,sizeof(float)* num_elts, cudaMemcpyHostToDevice);
  int numBlockThreads = MAX_THREADS_PER_BLOCK;
  int num_blocks = 0;
  int remain_elts = 0;
  int total_num_threads = globalRows * globalCols;
  cudaMalloc((void**) &device_mean, sizeof(float)* total_num_threads);
  cudaMemset(device_mean, 0, sizeof(float)* total_num_threads);
  cudaMalloc((void**) &device_var, sizeof(float)* total_num_threads);
  cudaMemset(device_var, 0, sizeof(float)* total_num_threads);
  num_blocks  =  total_num_threads/numBlockThreads;
  remain_elts =  total_num_threads%numBlockThreads;
  if(remain_elts > 0)
  {
    num_blocks++;
  }
  dim3 gridDim(num_blocks,1);
  dim3 blockDim(numBlockThreads,1);
  //kernelMean<<< gridDim,blockDim >>>(device_array, device_mean,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
  //kernelVar<<< gridDim,blockDim >>>(device_array, device_var,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
  kernelMeanVar<<< gridDim,blockDim >>>(device_array, device_mean, device_var,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
  host_mean = (float*) malloc( sizeof(float) * total_num_threads);
  memset(host_mean, 0, sizeof(float) * total_num_threads);
  host_var = (float*) malloc( sizeof(float) * total_num_threads);
  memset(host_var, 0, sizeof(float) * total_num_threads);
  cudaThreadSynchronize();
  cudaError_t error = cudaGetLastError();
   //if(error!=cudaSuccess) {
      printf("ERROR: %s\n", cudaGetErrorString(error) );
   //}
  cudaMemcpy(host_mean, device_mean, sizeof(float)*total_num_threads, cudaMemcpyDeviceToHost);
  convertToDouble(mean_ary, host_mean, total_num_threads);
  cudaMemcpy(host_var, device_var, sizeof(float)*total_num_threads,     cudaMemcpyDeviceToHost);
  for(int i = 0 ; i < 300 ; i++)
    printf("host_var[%d] %f \n",i, host_var[i]);
  convertToDouble(var_ary, host_var, total_num_threads);
  cudaFree(device_array);
  cudaFree(device_mean);
  cudaFree(device_var);
  free(host_mean);
  free(host_var);
  free(host_array_float);
}

Results with enabling


 global void kernelMean(d_a, d_mean, ...);
 global void kernelVar(d_a, d_var, ...);

ERROR: no error host_var[0] 4.497070 host_var[1] 5.061768 host_var[2] 5.687500 host_var[3] 6.347534 host_var[4] 6.829102 host_var[5] 12.940308 host_var[6] 14.309937 host_var[7] 15.141113 host_var[8] 18.741577 host_var[9] 21.323608 host_var[10] 21.727417 host_var[11] 192.348389 host_var[12] 579.911621 host_var[13] 800.821045 host_var[14] 1071.960938 host_var[15] 1077.261719 host_var[16] 993.262207 host_var[17] 924.379883 host_var[18] 839.437012 host_var[19] 810.847656 host_var[20] 835.007813 host_var[21] 1124.365723 host_var[22] 1241.685547 host_var[23] 1376.504150 host_var[24] 1196.745850 host_var[25] 1097.473877 host_var[26] 1008.840088 host_var[27] 867.585083 host_var[28] 794.241699 host_var[29] 1322.409790 host_var[30] 1556.029785 host_var[31] 1564.997803 host_var[32] 1870.985840 host_var[33] 1929.829590 host_var[34] 1822.189453 host_var[35] 1662.321777 host_var[36] 1372.886719 host_var[37] 1074.727539 host_var[38] 833.003906 host_var[39] 632.514648 host_var[40] 380.227539 host_var[41] 87.345703 host_var[42] 82.544922 host_var[43] 78.756836 host_var[44] 68.541016 host_var[45] 61.981445 host_var[46] 60.413086 host_var[47] 60.128906 host_var[48] 59.767578 host_var[49] 59.223633 host_var[50] 56.569336 host_var[51] 53.866211 host_var[52] 51.186523 host_var[53] 55.270508 host_var[54] 59.956055 host_var[55] 66.516602 host_var[56] 70.348633 host_var[57] 71.706055 host_var[58] 70.494141 host_var[59] 69.897461 host_var[60] 66.286133 host_var[61] 67.926758 host_var[62] 160.753906 host_var[63] 447.221191 host_var[64] 831.740723 host_var[65] 1076.513672 host_var[66] 1193.666992 host_var[67] 1208.239746 host_var[68] 1126.845947 host_var[69] 948.397461 host_var[70] 669.399414 host_var[71] 340.465576 host_var[72] 67.161865 host_var[73] 7.421082 host_var[74] 5.485626 host_var[75] 5.135620 host_var[76] 3.460419 host_var[77] 3.853577 host_var[78] 5.221100 host_var[79] 5.890381 host_var[80] 7.139618 host_var[81] 7.517609 host_var[82] 6.865875 host_var[83] 5.053909 host_var[84] 2.781616 host_var[85] 2.021912 host_var[86] 2.130417 host_var[87] 3.113586 host_var[88] 4.024399 host_var[89] 4.582413 host_var[90] 4.077118 host_var[91] 3.024384 host_var[92] 2.287506 host_var[93] 1.793579 host_var[94] 1.567474 host_var[95] 1.829895 host_var[96] 2.325928 host_var[97] 3.429993 host_var[98] 3.885559 host_var[99] 3.835602 host_var[100] 5.566406 host_var[101] 8.065582 host_var[102] 18.767456 host_var[103] 35.395599 host_var[104] 64.148407 host_var[105] 125.937866 host_var[106] 176.445618 host_var[107] 216.073059 host_var[108] 272.109985 host_var[109] 307.972412 host_var[110] 289.652344 host_var[111] 238.253662 host_var[112] 178.304932 host_var[113] 116.925049 host_var[114] 74.773926 host_var[115] 61.227295 host_var[116] 55.238525 host_var[117] 55.387451 host_var[118] 49.241699 host_var[119] 38.396240 host_var[120] 28.304932 host_var[121] 20.225342 host_var[122] 18.043457 host_var[123] 21.418457 host_var[124] 26.120117 host_var[125] 25.899414 host_var[126] 26.641602 host_var[127] 23.747437 host_var[128] 18.927368 host_var[129] 21.664307 host_var[130] 142.432373 host_var[131] 1575.141602 host_var[132] 2901.855957 host_var[133] 4195.149902 host_var[134] 5047.758789 host_var[135] 5450.164063 host_var[136] 5249.767578 host_var[137] 4577.365234 host_var[138] 3352.496094 host_var[139] 1641.593750 host_var[140] 352.242188 host_var[141] 224.824219 host_var[142] 194.578125 host_var[143] 178.875000 host_var[144] 175.148438 host_var[145] 174.117188 host_var[146] 172.707031 host_var[147] 169.578125 host_var[148] 176.308594 host_var[149] 181.968750 host_var[150] 191.507813 host_var[151] 198.500000 host_var[152] 206.824219 host_var[153] 213.273438 host_var[154] 220.312500 host_var[155] 218.859375 host_var[156] 213.941406 host_var[157] 205.474609 host_var[158] 190.722656 host_var[159] 178.414063 host_var[160] 169.302734 host_var[161] 3.750366 host_var[162] 4.333252 host_var[163] 4.901855 host_var[164] 5.527466 host_var[165] 6.201782 host_var[166] 11.921631 host_var[167] 14.135376 host_var[168] 14.885864 host_var[169] 19.083618 host_var[170] 21.290283 host_var[171] 21.415649 host_var[172] 209.747559 host_var[173] 580.304932 host_var[174] 800.949951 host_var[175] 1119.857422 host_var[176] 1129.382324 host_var[177] 1032.616211 host_var[178] 972.797363 host_var[179] 915.440918 host_var[180] 905.890137 host_var[181] 943.649902 host_var[182] 1207.445801 host_var[183] 1345.912109 host_var[184] 1478.704590 host_var[185] 1224.895508 host_var[186] 1105.403564 host_var[187] 1031.981201 host_var[188] 914.456421 host_var[189] 835.127441 host_var[190] 1320.454102 host_var[191] 1561.439941 host_var[192] 1599.149902 host_var[193] 1912.232910 host_var[194] 1993.473145 host_var[195] 1913.377441 host_var[196] 1784.035645 host_var[197] 1554.712891 host_var[198] 1244.698242 host_var[199] 926.668945

Reusults with enabling


global void kernelMeanVar(d_a, d_mean, d_var,...);

ERROR: no error host_var[0] 0.000000 host_var[1] 0.000000 host_var[2] 0.000000 host_var[3] 0.000000 host_var[4] 0.000000 host_var[5] 0.000000 host_var[6] 0.000000 host_var[7] 0.000000 host_var[8] 0.000000 host_var[9] 0.000000 host_var[10] 0.000000 host_var[11] 0.000000 host_var[12] 0.000000 host_var[13] 0.000000 host_var[14] 0.000000 host_var[15] 0.000000 host_var[16] 0.000000 host_var[17] 0.000000 host_var[18] 0.000000 host_var[19] 0.000000 host_var[20] 0.000000 host_var[21] 0.000000 host_var[22] 0.000000 host_var[23] 0.000000 host_var[24] 0.000000 host_var[25] 0.000000 host_var[26] 0.000000 host_var[27] 0.000000 host_var[28] 0.000000 host_var[29] 0.000000 host_var[30] 0.000000 host_var[31] 0.000000 host_var[32] 0.000000 host_var[33] 0.000000 host_var[34] 0.000000 host_var[35] 0.000000 host_var[36] 0.000000 host_var[37] 0.000000 host_var[38] 0.000000 host_var[39] 0.000000 host_var[40] 0.000000 host_var[41] 0.000000 host_var[42] 0.000000 host_var[43] 0.000000 host_var[44] 0.000000 host_var[45] 0.000000 host_var[46] 0.000000 host_var[47] 0.000000 host_var[48] 0.000000 host_var[49] 0.000000 host_var[50] 0.000000 host_var[51] 0.000000 host_var[52] 0.000000 host_var[53] 0.000000 host_var[54] 0.000000 host_var[55] 0.000000 host_var[56] 0.000000 host_var[57] 0.000000 host_var[58] 0.000000 host_var[59] 0.000000 host_var[60] 0.000000 host_var[61] 0.000000 host_var[62] 0.000000 host_var[63] 0.000000 host_var[64] 0.000000 host_var[65] 0.000000 host_var[66] 0.000000 host_var[67] 0.000000 host_var[68] 0.000000 host_var[69] 0.000000 host_var[70] 0.000000 host_var[71] 0.000000 host_var[72] 0.000000 host_var[73] 0.000000 host_var[74] 0.000000 host_var[75] 0.000000 host_var[76] 0.000000 host_var[77] 0.000000 host_var[78] 0.000000 host_var[79] 0.000000 host_var[80] 0.000000 host_var[81] 0.000000 host_var[82] 0.000000 host_var[83] 0.000000 host_var[84] 0.000000 host_var[85] 0.000000 host_var[86] 0.000000 host_var[87] 0.000000 host_var[88] 0.000000 host_var[89] 0.000000 host_var[90] 0.000000 host_var[91] 0.000000 host_var[92] 0.000000 host_var[93] 0.000000 host_var[94] 0.000000 host_var[95] 0.000000 host_var[96] 0.000000 host_var[97] 0.000000 host_var[98] 0.000000 host_var[99] 0.000000 host_var[100] 0.000000 host_var[101] 0.000000 host_var[102] 0.000000 host_var[103] 0.000000 host_var[104] 0.000000 host_var[105] 0.000000 host_var[106] 0.000000 host_var[107] 0.000000 host_var[108] 0.000000 host_var[109] 0.000000 host_var[110] 0.000000 host_var[111] 0.000000 host_var[112] 0.000000 host_var[113] 0.000000 host_var[114] 0.000000 host_var[115] 0.000000 host_var[116] 0.000000 host_var[117] 0.000000 host_var[118] 0.000000 host_var[119] 0.000000 host_var[120] 0.000000 host_var[121] 0.000000 host_var[122] 0.000000 host_var[123] 0.000000 host_var[124] 0.000000 host_var[125] 0.000000 host_var[126] 0.000000 host_var[127] 0.000000 host_var[128] 18.927368 host_var[129] 21.664307 host_var[130] 142.432373 host_var[131] 1575.141602 host_var[132] 2901.855957 host_var[133] 4195.149902 host_var[134] 5047.758789 host_var[135] 5450.164063 host_var[136] 5249.767578 host_var[137] 4577.365234 host_var[138] 3352.496094 host_var[139] 1641.593750 host_var[140] 352.242188 host_var[141] 224.824219 host_var[142] 194.578125 host_var[143] 178.875000 host_var[144] 175.148438 host_var[145] 174.117188 host_var[146] 172.707031 host_var[147] 169.578125 host_var[148] 176.308594 host_var[149] 181.968750 host_var[150] 191.507813 host_var[151] 198.500000 host_var[152] 206.824219 host_var[153] 213.273438 host_var[154] 220.312500 host_var[155] 218.859375 host_var[156] 213.941406 host_var[157] 205.474609 host_var[158] 190.722656 host_var[159] 178.414063 host_var[160] 169.302734 host_var[161] 3.750366 host_var[162] 4.333252 host_var[163] 4.901855 host_var[164] 5.527466 host_var[165] 6.201782 host_var[166] 11.921631 host_var[167] 14.135376 host_var[168] 14.885864 host_var[169] 19.083618 host_var[170] 21.290283 host_var[171] 21.415649 host_var[172] 209.747559 host_var[173] 580.304932 host_var[174] 800.949951 host_var[175] 1119.857422 host_var[176] 1129.382324 host_var[177] 1032.616211 host_var[178] 972.797363 host_var[179] 915.440918 host_var[180] 905.890137 host_var[181] 943.649902 host_var[182] 1207.445801 host_var[183] 1345.912109 host_var[184] 1478.704590 host_var[185] 1224.895508 host_var[186] 1105.403564 host_var[187] 1031.981201 host_var[188] 914.456421 host_var[189] 835.127441 host_var[190] 1320.454102 host_var[191] 1561.439941 host_var[192] 1599.149902 host_var[193] 1912.232910 host_var[194] 1993.473145 host_var[195] 1913.377441 host_var[196] 1784.035645 host_var[197] 1554.712891 host_var[198] 1244.698242 host_var[199] 926.668945

End of Results

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1

长发绾君心 2024-10-08 06:45:21

建议:

  • 【第一优先】检查CUDA函数的返回值,是否有报错。
  • 通过 cuda-memcheck 运行它。我不确定 globalRows、globalCols、localRows、localCols、num_elts 等之间的关系是什么,但读取越界似乎是问题的候选者。
  • 请记住,如果您不小心,对平方求和可能会很快导致舍入错误。考虑使用运行均值/方差或进行基于树的缩减。

Suggestions:

  • [First priority] Check return values from CUDA functions to see whether any errors are reported.
  • Run this through cuda-memcheck. I'm not sure what the relationship is between globalRows, globalCols, localRows, localCols, num_elts etc. is but reading out-of-bounds seems like a candadite for problems.
  • Remember that summing the squares can lead to rounding errors fairly quickly if you don't take care. Consider using a running mean/variance or doing a tree-based reduction.
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文