无法在单个内核代码中找到平均值和方差
亲爱的学者们,
我无法在单个内核调用中实现mean和var。
目标:我需要找到矩阵中子矩阵的均值和方差。所以我写了以下内核
函数:
global void kernelMean(d_a, d_mean, ...);
global void kernelVar(d_a, d_var, ...);
global void kernelMeanVar(d_a, d_mean, d_var,...);
问题: a)如果我单独计算 kernelMean 和 kernelVar ,它工作得很好 b) 如果我想在单个内核中计算 kernelMeanVar 是行不通的。
以下是我的代码,请让我知道可能的任何错误。
预先感谢您。
问候,
纳加拉朱
global void kernelMean(float* device_array, float* device_mean, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads)
{
int block_id = blockIdx.x + gridDim.x * blockIdx.y;
int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x;
int my_rowIdx = thread_id/globalCols ;
int my_colIdx = thread_id%globalCols ;
int i,j;
float temp;
float sum = 0;
float sumsq = 0.0;
float mean;
float ltotal_elts = (float) (localRows*localCols);
device_mean[thread_id] = 0;
if(thread_id <total_num_threads)
{
for(i=0; i < localRows ; i++)
{
for(j=0 ; j < localCols; j++)
{
temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
sumsq = sumsq + (temp*temp);
sum = sum + temp;
}
}
mean = sum/ltotal_elts;
device_mean[thread_id] = mean;
}
}
全局 void kernelVar(float* device_array,float* device_var, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads)
{
int block_id = blockIdx.x + gridDim.x * blockIdx.y;
int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x;
int my_rowIdx = thread_id/globalCols ;
int my_colIdx = thread_id%globalCols ;
整数 i,j;
浮动温度;
浮点总和=0;
浮点总和=0;
浮点数平均值 = 0;
浮动变量 = 0;
浮动 ltotal_elts = (浮动) localRows*localCols;
device_var[thread_id] = 0;
if(thread_id < total_num_threads)
{
for(i=0; i < localRows ; i++)
{
for(j=0 ; j < localCols; j++)
{
temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
sum = sum + temp;
sumsq = sumsq + (temp*temp);
}
}
mean = sum/ltotal_elts;
device_var[thread_id] = (sumsq/ltotal_elts) - (mean*mean);
}
}
全局 void kernelMeanVar(float* device_array, float* device_mean,float* device_var, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads)
{
int block_id = blockIdx.x + gridDim.x * blockIdx.y;
int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x;
int my_rowIdx = thread_id/globalCols ;
int my_colIdx = thread_id%globalCols ;
整数 i,j;
浮动温度;
浮点总和=0;
浮点数总和 = 0.0;
浮动平均值;
浮动 ltotal_elts = (浮动) (localRows*localCols);
device_mean[线程id] = 0;
device_var[线程id] = 0;
if(thread_id < 总线程数)
{
for(i=0; i < localRows ; i++)
{
for(j=0 ; j < localCols; j++)
{
temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
sumsq = sumsq + (temp*temp);
总和=总和+温度;
}
}
平均值=总和/ltotal_elts;
device_mean[thread_id] = 平均值;
device_var[thread_id] = (sumsq/ltotal_elts) - (平均值*平均值);
}
}
启用后的内核调用函数
void convertToFloat(float** float_ary, double* double_ary, int num_elts)
{
for(int i = 0; i < num_elts ; i++)
{
(*float_ary)[i] = (float) double_ary[i];
//printf("float_ary[%d] : %f \n", i, (*float_ary)[i]);
}
return;
}
void convertToDouble(double** double_ary, float* float_ary, int num_elts)
{
for(int i = 0; i < num_elts ; i++)
{
(*double_ary)[i] = (double) float_ary[i];
}
return;
}
void computeMeanAndVarArray(double* host_array, int num_elts, int globalRows, int globalCols, int localRows, int localCols, int numRowElts, double** mean_ary, double** var_ary)
{
float* host_array_float;
float* device_array;
float* host_mean;
float* host_var;
float* device_mean;
float* device_var;
double total_bytes =0;
host_array_float = (float*) malloc (num_elts*sizeof(float));
convertToFloat(&host_array_float, host_array, num_elts);
//printf("num_elts %d \n", num_elts);
cudaMalloc((void**) &device_array, sizeof(float)* num_elts);
cudaMemset(device_array, 0, sizeof(float)* num_elts);
cudaMemcpy(device_array, host_array_float,sizeof(float)* num_elts, cudaMemcpyHostToDevice);
int numBlockThreads = MAX_THREADS_PER_BLOCK;
int num_blocks = 0;
int remain_elts = 0;
int total_num_threads = globalRows * globalCols;
cudaMalloc((void**) &device_mean, sizeof(float)* total_num_threads);
cudaMemset(device_mean, 0, sizeof(float)* total_num_threads);
cudaMalloc((void**) &device_var, sizeof(float)* total_num_threads);
cudaMemset(device_var, 0, sizeof(float)* total_num_threads);
num_blocks = total_num_threads/numBlockThreads;
remain_elts = total_num_threads%numBlockThreads;
if(remain_elts > 0)
{
num_blocks++;
}
dim3 gridDim(num_blocks,1);
dim3 blockDim(numBlockThreads,1);
//kernelMean<<< gridDim,blockDim >>>(device_array, device_mean,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
//kernelVar<<< gridDim,blockDim >>>(device_array, device_var,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
kernelMeanVar<<< gridDim,blockDim >>>(device_array, device_mean, device_var,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
host_mean = (float*) malloc( sizeof(float) * total_num_threads);
memset(host_mean, 0, sizeof(float) * total_num_threads);
host_var = (float*) malloc( sizeof(float) * total_num_threads);
memset(host_var, 0, sizeof(float) * total_num_threads);
cudaThreadSynchronize();
cudaError_t error = cudaGetLastError();
//if(error!=cudaSuccess) {
printf("ERROR: %s\n", cudaGetErrorString(error) );
//}
cudaMemcpy(host_mean, device_mean, sizeof(float)*total_num_threads, cudaMemcpyDeviceToHost);
convertToDouble(mean_ary, host_mean, total_num_threads);
cudaMemcpy(host_var, device_var, sizeof(float)*total_num_threads, cudaMemcpyDeviceToHost);
for(int i = 0 ; i < 300 ; i++)
printf("host_var[%d] %f \n",i, host_var[i]);
convertToDouble(var_ary, host_var, total_num_threads);
cudaFree(device_array);
cudaFree(device_mean);
cudaFree(device_var);
free(host_mean);
free(host_var);
free(host_array_float);
}
结果
global void kernelMean(d_a, d_mean, ...);
global void kernelVar(d_a, d_var, ...);
错误:没有错误
主机变量[0] 4.497070
主机变量[1] 5.061768
主机变量[2] 5.687500
主机变量[3] 6.347534
主机变量[4] 6.829102
主机变量[5] 12.940308
主机变量[6] 14.309937
主机变量[7] 15.141113
主机变量[8] 18.741577
主机变量[9] 21.323608
主机变量[10] 21.727417
主机变量[11] 192.348389
主机变量[12] 579.911621
主机变量[13] 800.821045
主机变量[14] 1071.960938
主机变量[15] 1077.261719
主机变量[16] 993.262207
主机变量[17] 924.379883
主机变量[18] 839.437012
主机变量[19] 810.847656
主机变量[20] 835.007813
主机变量[21] 1124.365723
主机变量[22] 1241.685547
主机变量[23] 1376.504150
主机变量[24] 1196.745850
主机变量[25] 1097.473877
主机变量[26] 1008.840088
主机变量[27] 867.585083
主机变量[28] 794.241699
主机变量[29] 1322.409790
主机变量[30] 1556.029785
主机变量[31] 1564.997803
主机变量[32] 1870.985840
主机变量[33] 1929.829590
主机变量[34] 1822.189453
主机变量[35] 1662.321777
主机变量[36] 1372.886719
主机变量[37] 1074.727539
主机变量[38] 833.003906
主机变量[39] 632.514648
主机变量[40] 380.227539
主机变量[41] 87.345703
主机变量[42] 82.544922
主机变量[43] 78.756836
主机变量[44] 68.541016
主机变量[45] 61.981445
主机变量[46] 60.413086
主机变量[47] 60.128906
主机变量[48] 59.767578
主机变量[49] 59.223633
主机变量[50] 56.569336
主机变量[51] 53.866211
主机变量[52] 51.186523
主机变量[53] 55.270508
主机变量[54] 59.956055
主机变量[55] 66.516602
主机变量[56] 70.348633
主机变量[57] 71.706055
主机变量[58] 70.494141
主机变量[59] 69.897461
主机变量[60] 66.286133
主机变量[61] 67.926758
主机变量[62] 160.753906
主机变量[63] 447.221191
主机变量[64] 831.740723
主机变量[65] 1076.513672
主机变量[66] 1193.666992
主机变量[67] 1208.239746
主机变量[68] 1126.845947
主机变量[69] 948.397461
主机变量[70] 669.399414
主机变量[71] 340.465576
主机变量[72] 67.161865
主机变量[73] 7.421082
主机变量[74] 5.485626
主机变量[75] 5.135620
主机变量[76] 3.460419
主机变量[77] 3.853577
主机变量[78] 5.221100
主机变量[79] 5.890381
主机变量[80] 7.139618
主机变量[81] 7.517609
主机变量[82] 6.865875
主机变量[83] 5.053909
主机变量[84] 2.781616
主机变量[85] 2.021912
主机变量[86] 2.130417
主机变量[87] 3.113586
主机变量[88] 4.024399
主机变量[89] 4.582413
主机变量[90] 4.077118
主机变量[91] 3.024384
主机变量[92] 2.287506
主机变量[93] 1.793579
主机变量[94] 1.567474
主机变量[95] 1.829895
主机变量[96] 2.325928
主机变量[97] 3.429993
主机变量[98] 3.885559
主机变量[99] 3.835602
主机变量[100] 5.566406
主机变量[101] 8.065582
主机变量[102] 18.767456
主机变量[103] 35.395599
主机变量[104] 64.148407
主机变量[105] 125.937866
主机变量[106] 176.445618
主机变量[107] 216.073059
主机变量[108] 272.109985
主机变量[109] 307.972412
主机变量[110] 289.652344
主机变量[111] 238.253662
主机变量[112] 178.304932
主机变量[113] 116.925049
主机变量[114] 74.773926
主机变量[115] 61.227295
主机变量[116] 55.238525
主机变量[117] 55.387451
主机变量[118] 49.241699
主机变量[119] 38.396240
主机变量[120] 28.304932
主机变量[121] 20.225342
主机变量[122] 18.043457
主机变量[123] 21.418457
主机变量[124] 26.120117
主机变量[125] 25.899414
主机变量[126] 26.641602
主机变量[127] 23.747437
主机变量[128] 18.927368
主机变量[129] 21.664307
主机变量[130] 142.432373
主机变量[131] 1575.141602
主机变量[132] 2901.855957
主机变量[133] 4195.149902
主机变量[134] 5047.758789
主机变量[135] 5450.164063
主机变量[136] 5249.767578
主机变量[137] 4577.365234
主机变量[138] 3352.496094
主机变量[139] 1641.593750
主机变量[140] 352.242188
主机变量[141] 224.824219
主机变量[142] 194.578125
主机变量[143] 178.875000
主机变量[144] 175.148438
主机变量[145] 174.117188
主机变量[146] 172.707031
主机变量[147] 169.578125
主机变量[148] 176.308594
主机变量[149] 181.968750
主机变量[150] 191.507813
主机变量[151] 198.500000
主机变量[152] 206.824219
主机变量[153] 213.273438
主机变量[154] 220.312500
主机变量[155] 218.859375
主机变量[156] 213.941406
主机变量[157] 205.474609
主机变量[158] 190.722656
主机变量[159] 178.414063
主机变量[160] 169.302734
主机变量[161] 3.750366
主机变量[162] 4.333252
主机变量[163] 4.901855
主机变量[164] 5.527466
主机变量[165] 6.201782
主机变量[166] 11.921631
主机变量[167] 14.135376
主机变量[168] 14.885864
主机变量[169] 19.083618
主机变量[170] 21.290283
主机变量[171] 21.415649
主机变量[172] 209.747559
主机变量[173] 580.304932
主机变量[174] 800.949951
主机变量[175] 1119.857422
主机变量[176] 1129.382324
主机变量[177] 1032.616211
主机变量[178] 972.797363
主机变量[179] 915.440918
主机变量[180] 905.890137
主机变量[181] 943.649902
主机变量[182] 1207.445801
主机变量[183] 1345.912109
主机变量[184] 1478.704590
主机变量[185] 1224.895508
主机变量[186] 1105.403564
主机变量[187] 1031.981201
主机变量[188] 914.456421
主机变量[189] 835.127441
主机变量[190] 1320.454102
主机变量[191] 1561.439941
主机变量[192] 1599.149902
主机变量[193] 1912.232910
主机变量[194] 1993.473145
主机变量[195] 1913.377441
主机变量[196] 1784.035645
主机变量[197] 1554.712891
主机变量[198] 1244.698242
host_var[199] 926.668945
启用结果
global void kernelMeanVar(d_a, d_mean, d_var,...);
错误:没有错误
主机变量[0] 0.000000
主机变量[1] 0.000000
主机变量[2] 0.000000
主机变量[3] 0.000000
主机变量[4] 0.000000
主机变量[5] 0.000000
主机变量[6] 0.000000
主机变量[7] 0.000000
主机变量[8] 0.000000
主机变量[9] 0.000000
主机变量[10] 0.000000
主机变量[11] 0.000000
主机变量[12] 0.000000
主机变量[13] 0.000000
主机变量[14] 0.000000
主机变量[15] 0.000000
主机变量[16] 0.000000
主机变量[17] 0.000000
主机变量[18] 0.000000
主机变量[19] 0.000000
主机变量[20] 0.000000
主机变量[21] 0.000000
主机变量[22] 0.000000
主机变量[23] 0.000000
主机变量[24] 0.000000
主机变量[25] 0.000000
主机变量[26] 0.000000
主机变量[27] 0.000000
主机变量[28] 0.000000
主机变量[29] 0.000000
主机变量[30] 0.000000
主机变量[31] 0.000000
主机变量[32] 0.000000
主机变量[33] 0.000000
主机变量[34] 0.000000
主机变量[35] 0.000000
主机变量[36] 0.000000
主机变量[37] 0.000000
主机变量[38] 0.000000
主机变量[39] 0.000000
主机变量[40] 0.000000
主机变量[41] 0.000000
主机变量[42] 0.000000
主机变量[43] 0.000000
主机变量[44] 0.000000
主机变量[45] 0.000000
主机变量[46] 0.000000
主机变量[47] 0.000000
主机变量[48] 0.000000
主机变量[49] 0.000000
主机变量[50] 0.000000
主机变量[51] 0.000000
主机变量[52] 0.000000
主机变量[53] 0.000000
主机变量[54] 0.000000
主机变量[55] 0.000000
主机变量[56] 0.000000
主机变量[57] 0.000000
主机变量[58] 0.000000
主机变量[59] 0.000000
主机变量[60] 0.000000
主机变量[61] 0.000000
主机变量[62] 0.000000
主机变量[63] 0.000000
主机变量[64] 0.000000
主机变量[65] 0.000000
主机变量[66] 0.000000
主机变量[67] 0.000000
主机变量[68] 0.000000
主机变量[69] 0.000000
主机变量[70] 0.000000
主机变量[71] 0.000000
主机变量[72] 0.000000
主机变量[73] 0.000000
主机变量[74] 0.000000
主机变量[75] 0.000000
主机变量[76] 0.000000
主机变量[77] 0.000000
主机变量[78] 0.000000
主机变量[79] 0.000000
主机变量[80] 0.000000
主机变量[81] 0.000000
主机变量[82] 0.000000
主机变量[83] 0.000000
主机变量[84] 0.000000
主机变量[85] 0.000000
主机变量[86] 0.000000
主机变量[87] 0.000000
主机变量[88] 0.000000
主机变量[89] 0.000000
主机变量[90] 0.000000
主机变量[91] 0.000000
主机变量[92] 0.000000
主机变量[93] 0.000000
主机变量[94] 0.000000
主机变量[95] 0.000000
主机变量[96] 0.000000
主机变量[97] 0.000000
主机变量[98] 0.000000
主机变量[99] 0.000000
主机变量[100] 0.000000
主机变量[101] 0.000000
主机变量[102] 0.000000
主机变量[103] 0.000000
主机变量[104] 0.000000
主机变量[105] 0.000000
主机变量[106] 0.000000
主机变量[107] 0.000000
主机变量[108] 0.000000
主机变量[109] 0.000000
主机变量[110] 0.000000
主机变量[111] 0.000000
主机变量[112] 0.000000
主机变量[113] 0.000000
主机变量[114] 0.000000
主机变量[115] 0.000000
主机变量[116] 0.000000
主机变量[117] 0.000000
主机变量[118] 0.000000
主机变量[119] 0.000000
主机变量[120] 0.000000
主机变量[121] 0.000000
主机变量[122] 0.000000
主机变量[123] 0.000000
主机变量[124] 0.000000
主机变量[125] 0.000000
主机变量[126] 0.000000
主机变量[127] 0.000000
主机变量[128] 18.927368
主机变量[129] 21.664307
主机变量[130] 142.432373
主机变量[131] 1575.141602
主机变量[132] 2901.855957
主机变量[133] 4195.149902
主机变量[134] 5047.758789
主机变量[135] 5450.164063
主机变量[136] 5249.767578
主机变量[137] 4577.365234
主机变量[138] 3352.496094
主机变量[139] 1641.593750
主机变量[140] 352.242188
主机变量[141] 224.824219
主机变量[142] 194.578125
主机变量[143] 178.875000
主机变量[144] 175.148438
主机变量[145] 174.117188
主机变量[146] 172.707031
主机变量[147] 169.578125
主机变量[148] 176.308594
主机变量[149] 181.968750
主机变量[150] 191.507813
主机变量[151] 198.500000
主机变量[152] 206.824219
主机变量[153] 213.273438
主机变量[154] 220.312500
主机变量[155] 218.859375
主机变量[156] 213.941406
主机变量[157] 205.474609
主机变量[158] 190.722656
主机变量[159] 178.414063
主机变量[160] 169.302734
主机变量[161] 3.750366
主机变量[162] 4.333252
主机变量[163] 4.901855
主机变量[164] 5.527466
主机变量[165] 6.201782
主机变量[166] 11.921631
主机变量[167] 14.135376
主机变量[168] 14.885864
主机变量[169] 19.083618
主机变量[170] 21.290283
主机变量[171] 21.415649
主机变量[172] 209.747559
主机变量[173] 580.304932
主机变量[174] 800.949951
主机变量[175] 1119.857422
主机变量[176] 1129.382324
主机变量[177] 1032.616211
主机变量[178] 972.797363
主机变量[179] 915.440918
主机变量[180] 905.890137
主机变量[181] 943.649902
主机变量[182] 1207.445801
主机变量[183] 1345.912109
主机变量[184] 1478.704590
主机变量[185] 1224.895508
主机变量[186] 1105.403564
主机变量[187] 1031.981201
主机变量[188] 914.456421
主机变量[189] 835.127441
主机变量[190] 1320.454102
主机变量[191] 1561.439941
主机变量[192] 1599.149902
主机变量[193] 1912.232910
主机变量[194] 1993.473145
主机变量[195] 1913.377441
主机变量[196] 1784.035645
主机变量[197] 1554.712891
主机变量[198] 1244.698242
host_var[199] 926.668945
结果结束
Dear Scholars,
I am unable to implement mean and var in single kernel call.
Goal: I need to find mean and var of sub matrixs in a matrix. so I wrote following kernels
functions:
global void kernelMean(d_a, d_mean, ...);
global void kernelVar(d_a, d_var, ...);
global void kernelMeanVar(d_a, d_mean, d_var,...);
Issue:
a) if I compute the kernelMean and kernelVar individually it works fine
b) if I want to compute kernelMeanVar in single kernel is does not work.
Below is my code, kindly let me know on possible any errors.
Thanking you in advance.
Regards,
Nagaraju
global void kernelMean(float* device_array, float* device_mean, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads)
{
int block_id = blockIdx.x + gridDim.x * blockIdx.y;
int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x;
int my_rowIdx = thread_id/globalCols ;
int my_colIdx = thread_id%globalCols ;
int i,j;
float temp;
float sum = 0;
float sumsq = 0.0;
float mean;
float ltotal_elts = (float) (localRows*localCols);
device_mean[thread_id] = 0;
if(thread_id <total_num_threads)
{
for(i=0; i < localRows ; i++)
{
for(j=0 ; j < localCols; j++)
{
temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
sumsq = sumsq + (temp*temp);
sum = sum + temp;
}
}
mean = sum/ltotal_elts;
device_mean[thread_id] = mean;
}
}
global void kernelVar(float* device_array,float* device_var, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads)
{
int block_id = blockIdx.x + gridDim.x * blockIdx.y;
int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x;
int my_rowIdx = thread_id/globalCols ;
int my_colIdx = thread_id%globalCols ;
int i,j;
float temp;
float sum = 0;
float sumsq = 0;
float mean = 0;
float var = 0;
float ltotal_elts = (float) localRows*localCols;
device_var[thread_id] = 0;
if(thread_id < total_num_threads)
{
for(i=0; i < localRows ; i++)
{
for(j=0 ; j < localCols; j++)
{
temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
sum = sum + temp;
sumsq = sumsq + (temp*temp);
}
}
mean = sum/ltotal_elts;
device_var[thread_id] = (sumsq/ltotal_elts) - (mean*mean);
}
}
global void kernelMeanVar(float* device_array, float* device_mean,float* device_var, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads)
{
int block_id = blockIdx.x + gridDim.x * blockIdx.y;
int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x;
int my_rowIdx = thread_id/globalCols ;
int my_colIdx = thread_id%globalCols ;
int i,j;
float temp;
float sum = 0;
float sumsq = 0.0;
float mean;
float ltotal_elts = (float) (localRows*localCols);
device_mean[thread_id] = 0;
device_var[thread_id] = 0;
if(thread_id < total_num_threads)
{
for(i=0; i < localRows ; i++)
{
for(j=0 ; j < localCols; j++)
{
temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
sumsq = sumsq + (temp*temp);
sum = sum + temp;
}
}
mean = sum/ltotal_elts;
device_mean[thread_id] = mean;
device_var[thread_id] = (sumsq/ltotal_elts) - (mean*mean);
}
}
Kernel Call Functions
void convertToFloat(float** float_ary, double* double_ary, int num_elts)
{
for(int i = 0; i < num_elts ; i++)
{
(*float_ary)[i] = (float) double_ary[i];
//printf("float_ary[%d] : %f \n", i, (*float_ary)[i]);
}
return;
}
void convertToDouble(double** double_ary, float* float_ary, int num_elts)
{
for(int i = 0; i < num_elts ; i++)
{
(*double_ary)[i] = (double) float_ary[i];
}
return;
}
void computeMeanAndVarArray(double* host_array, int num_elts, int globalRows, int globalCols, int localRows, int localCols, int numRowElts, double** mean_ary, double** var_ary)
{
float* host_array_float;
float* device_array;
float* host_mean;
float* host_var;
float* device_mean;
float* device_var;
double total_bytes =0;
host_array_float = (float*) malloc (num_elts*sizeof(float));
convertToFloat(&host_array_float, host_array, num_elts);
//printf("num_elts %d \n", num_elts);
cudaMalloc((void**) &device_array, sizeof(float)* num_elts);
cudaMemset(device_array, 0, sizeof(float)* num_elts);
cudaMemcpy(device_array, host_array_float,sizeof(float)* num_elts, cudaMemcpyHostToDevice);
int numBlockThreads = MAX_THREADS_PER_BLOCK;
int num_blocks = 0;
int remain_elts = 0;
int total_num_threads = globalRows * globalCols;
cudaMalloc((void**) &device_mean, sizeof(float)* total_num_threads);
cudaMemset(device_mean, 0, sizeof(float)* total_num_threads);
cudaMalloc((void**) &device_var, sizeof(float)* total_num_threads);
cudaMemset(device_var, 0, sizeof(float)* total_num_threads);
num_blocks = total_num_threads/numBlockThreads;
remain_elts = total_num_threads%numBlockThreads;
if(remain_elts > 0)
{
num_blocks++;
}
dim3 gridDim(num_blocks,1);
dim3 blockDim(numBlockThreads,1);
//kernelMean<<< gridDim,blockDim >>>(device_array, device_mean,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
//kernelVar<<< gridDim,blockDim >>>(device_array, device_var,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
kernelMeanVar<<< gridDim,blockDim >>>(device_array, device_mean, device_var,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
host_mean = (float*) malloc( sizeof(float) * total_num_threads);
memset(host_mean, 0, sizeof(float) * total_num_threads);
host_var = (float*) malloc( sizeof(float) * total_num_threads);
memset(host_var, 0, sizeof(float) * total_num_threads);
cudaThreadSynchronize();
cudaError_t error = cudaGetLastError();
//if(error!=cudaSuccess) {
printf("ERROR: %s\n", cudaGetErrorString(error) );
//}
cudaMemcpy(host_mean, device_mean, sizeof(float)*total_num_threads, cudaMemcpyDeviceToHost);
convertToDouble(mean_ary, host_mean, total_num_threads);
cudaMemcpy(host_var, device_var, sizeof(float)*total_num_threads, cudaMemcpyDeviceToHost);
for(int i = 0 ; i < 300 ; i++)
printf("host_var[%d] %f \n",i, host_var[i]);
convertToDouble(var_ary, host_var, total_num_threads);
cudaFree(device_array);
cudaFree(device_mean);
cudaFree(device_var);
free(host_mean);
free(host_var);
free(host_array_float);
}
Results with enabling
global void kernelMean(d_a, d_mean, ...);
global void kernelVar(d_a, d_var, ...);
ERROR: no error
host_var[0] 4.497070
host_var[1] 5.061768
host_var[2] 5.687500
host_var[3] 6.347534
host_var[4] 6.829102
host_var[5] 12.940308
host_var[6] 14.309937
host_var[7] 15.141113
host_var[8] 18.741577
host_var[9] 21.323608
host_var[10] 21.727417
host_var[11] 192.348389
host_var[12] 579.911621
host_var[13] 800.821045
host_var[14] 1071.960938
host_var[15] 1077.261719
host_var[16] 993.262207
host_var[17] 924.379883
host_var[18] 839.437012
host_var[19] 810.847656
host_var[20] 835.007813
host_var[21] 1124.365723
host_var[22] 1241.685547
host_var[23] 1376.504150
host_var[24] 1196.745850
host_var[25] 1097.473877
host_var[26] 1008.840088
host_var[27] 867.585083
host_var[28] 794.241699
host_var[29] 1322.409790
host_var[30] 1556.029785
host_var[31] 1564.997803
host_var[32] 1870.985840
host_var[33] 1929.829590
host_var[34] 1822.189453
host_var[35] 1662.321777
host_var[36] 1372.886719
host_var[37] 1074.727539
host_var[38] 833.003906
host_var[39] 632.514648
host_var[40] 380.227539
host_var[41] 87.345703
host_var[42] 82.544922
host_var[43] 78.756836
host_var[44] 68.541016
host_var[45] 61.981445
host_var[46] 60.413086
host_var[47] 60.128906
host_var[48] 59.767578
host_var[49] 59.223633
host_var[50] 56.569336
host_var[51] 53.866211
host_var[52] 51.186523
host_var[53] 55.270508
host_var[54] 59.956055
host_var[55] 66.516602
host_var[56] 70.348633
host_var[57] 71.706055
host_var[58] 70.494141
host_var[59] 69.897461
host_var[60] 66.286133
host_var[61] 67.926758
host_var[62] 160.753906
host_var[63] 447.221191
host_var[64] 831.740723
host_var[65] 1076.513672
host_var[66] 1193.666992
host_var[67] 1208.239746
host_var[68] 1126.845947
host_var[69] 948.397461
host_var[70] 669.399414
host_var[71] 340.465576
host_var[72] 67.161865
host_var[73] 7.421082
host_var[74] 5.485626
host_var[75] 5.135620
host_var[76] 3.460419
host_var[77] 3.853577
host_var[78] 5.221100
host_var[79] 5.890381
host_var[80] 7.139618
host_var[81] 7.517609
host_var[82] 6.865875
host_var[83] 5.053909
host_var[84] 2.781616
host_var[85] 2.021912
host_var[86] 2.130417
host_var[87] 3.113586
host_var[88] 4.024399
host_var[89] 4.582413
host_var[90] 4.077118
host_var[91] 3.024384
host_var[92] 2.287506
host_var[93] 1.793579
host_var[94] 1.567474
host_var[95] 1.829895
host_var[96] 2.325928
host_var[97] 3.429993
host_var[98] 3.885559
host_var[99] 3.835602
host_var[100] 5.566406
host_var[101] 8.065582
host_var[102] 18.767456
host_var[103] 35.395599
host_var[104] 64.148407
host_var[105] 125.937866
host_var[106] 176.445618
host_var[107] 216.073059
host_var[108] 272.109985
host_var[109] 307.972412
host_var[110] 289.652344
host_var[111] 238.253662
host_var[112] 178.304932
host_var[113] 116.925049
host_var[114] 74.773926
host_var[115] 61.227295
host_var[116] 55.238525
host_var[117] 55.387451
host_var[118] 49.241699
host_var[119] 38.396240
host_var[120] 28.304932
host_var[121] 20.225342
host_var[122] 18.043457
host_var[123] 21.418457
host_var[124] 26.120117
host_var[125] 25.899414
host_var[126] 26.641602
host_var[127] 23.747437
host_var[128] 18.927368
host_var[129] 21.664307
host_var[130] 142.432373
host_var[131] 1575.141602
host_var[132] 2901.855957
host_var[133] 4195.149902
host_var[134] 5047.758789
host_var[135] 5450.164063
host_var[136] 5249.767578
host_var[137] 4577.365234
host_var[138] 3352.496094
host_var[139] 1641.593750
host_var[140] 352.242188
host_var[141] 224.824219
host_var[142] 194.578125
host_var[143] 178.875000
host_var[144] 175.148438
host_var[145] 174.117188
host_var[146] 172.707031
host_var[147] 169.578125
host_var[148] 176.308594
host_var[149] 181.968750
host_var[150] 191.507813
host_var[151] 198.500000
host_var[152] 206.824219
host_var[153] 213.273438
host_var[154] 220.312500
host_var[155] 218.859375
host_var[156] 213.941406
host_var[157] 205.474609
host_var[158] 190.722656
host_var[159] 178.414063
host_var[160] 169.302734
host_var[161] 3.750366
host_var[162] 4.333252
host_var[163] 4.901855
host_var[164] 5.527466
host_var[165] 6.201782
host_var[166] 11.921631
host_var[167] 14.135376
host_var[168] 14.885864
host_var[169] 19.083618
host_var[170] 21.290283
host_var[171] 21.415649
host_var[172] 209.747559
host_var[173] 580.304932
host_var[174] 800.949951
host_var[175] 1119.857422
host_var[176] 1129.382324
host_var[177] 1032.616211
host_var[178] 972.797363
host_var[179] 915.440918
host_var[180] 905.890137
host_var[181] 943.649902
host_var[182] 1207.445801
host_var[183] 1345.912109
host_var[184] 1478.704590
host_var[185] 1224.895508
host_var[186] 1105.403564
host_var[187] 1031.981201
host_var[188] 914.456421
host_var[189] 835.127441
host_var[190] 1320.454102
host_var[191] 1561.439941
host_var[192] 1599.149902
host_var[193] 1912.232910
host_var[194] 1993.473145
host_var[195] 1913.377441
host_var[196] 1784.035645
host_var[197] 1554.712891
host_var[198] 1244.698242
host_var[199] 926.668945
Reusults with enabling
global void kernelMeanVar(d_a, d_mean, d_var,...);
ERROR: no error
host_var[0] 0.000000
host_var[1] 0.000000
host_var[2] 0.000000
host_var[3] 0.000000
host_var[4] 0.000000
host_var[5] 0.000000
host_var[6] 0.000000
host_var[7] 0.000000
host_var[8] 0.000000
host_var[9] 0.000000
host_var[10] 0.000000
host_var[11] 0.000000
host_var[12] 0.000000
host_var[13] 0.000000
host_var[14] 0.000000
host_var[15] 0.000000
host_var[16] 0.000000
host_var[17] 0.000000
host_var[18] 0.000000
host_var[19] 0.000000
host_var[20] 0.000000
host_var[21] 0.000000
host_var[22] 0.000000
host_var[23] 0.000000
host_var[24] 0.000000
host_var[25] 0.000000
host_var[26] 0.000000
host_var[27] 0.000000
host_var[28] 0.000000
host_var[29] 0.000000
host_var[30] 0.000000
host_var[31] 0.000000
host_var[32] 0.000000
host_var[33] 0.000000
host_var[34] 0.000000
host_var[35] 0.000000
host_var[36] 0.000000
host_var[37] 0.000000
host_var[38] 0.000000
host_var[39] 0.000000
host_var[40] 0.000000
host_var[41] 0.000000
host_var[42] 0.000000
host_var[43] 0.000000
host_var[44] 0.000000
host_var[45] 0.000000
host_var[46] 0.000000
host_var[47] 0.000000
host_var[48] 0.000000
host_var[49] 0.000000
host_var[50] 0.000000
host_var[51] 0.000000
host_var[52] 0.000000
host_var[53] 0.000000
host_var[54] 0.000000
host_var[55] 0.000000
host_var[56] 0.000000
host_var[57] 0.000000
host_var[58] 0.000000
host_var[59] 0.000000
host_var[60] 0.000000
host_var[61] 0.000000
host_var[62] 0.000000
host_var[63] 0.000000
host_var[64] 0.000000
host_var[65] 0.000000
host_var[66] 0.000000
host_var[67] 0.000000
host_var[68] 0.000000
host_var[69] 0.000000
host_var[70] 0.000000
host_var[71] 0.000000
host_var[72] 0.000000
host_var[73] 0.000000
host_var[74] 0.000000
host_var[75] 0.000000
host_var[76] 0.000000
host_var[77] 0.000000
host_var[78] 0.000000
host_var[79] 0.000000
host_var[80] 0.000000
host_var[81] 0.000000
host_var[82] 0.000000
host_var[83] 0.000000
host_var[84] 0.000000
host_var[85] 0.000000
host_var[86] 0.000000
host_var[87] 0.000000
host_var[88] 0.000000
host_var[89] 0.000000
host_var[90] 0.000000
host_var[91] 0.000000
host_var[92] 0.000000
host_var[93] 0.000000
host_var[94] 0.000000
host_var[95] 0.000000
host_var[96] 0.000000
host_var[97] 0.000000
host_var[98] 0.000000
host_var[99] 0.000000
host_var[100] 0.000000
host_var[101] 0.000000
host_var[102] 0.000000
host_var[103] 0.000000
host_var[104] 0.000000
host_var[105] 0.000000
host_var[106] 0.000000
host_var[107] 0.000000
host_var[108] 0.000000
host_var[109] 0.000000
host_var[110] 0.000000
host_var[111] 0.000000
host_var[112] 0.000000
host_var[113] 0.000000
host_var[114] 0.000000
host_var[115] 0.000000
host_var[116] 0.000000
host_var[117] 0.000000
host_var[118] 0.000000
host_var[119] 0.000000
host_var[120] 0.000000
host_var[121] 0.000000
host_var[122] 0.000000
host_var[123] 0.000000
host_var[124] 0.000000
host_var[125] 0.000000
host_var[126] 0.000000
host_var[127] 0.000000
host_var[128] 18.927368
host_var[129] 21.664307
host_var[130] 142.432373
host_var[131] 1575.141602
host_var[132] 2901.855957
host_var[133] 4195.149902
host_var[134] 5047.758789
host_var[135] 5450.164063
host_var[136] 5249.767578
host_var[137] 4577.365234
host_var[138] 3352.496094
host_var[139] 1641.593750
host_var[140] 352.242188
host_var[141] 224.824219
host_var[142] 194.578125
host_var[143] 178.875000
host_var[144] 175.148438
host_var[145] 174.117188
host_var[146] 172.707031
host_var[147] 169.578125
host_var[148] 176.308594
host_var[149] 181.968750
host_var[150] 191.507813
host_var[151] 198.500000
host_var[152] 206.824219
host_var[153] 213.273438
host_var[154] 220.312500
host_var[155] 218.859375
host_var[156] 213.941406
host_var[157] 205.474609
host_var[158] 190.722656
host_var[159] 178.414063
host_var[160] 169.302734
host_var[161] 3.750366
host_var[162] 4.333252
host_var[163] 4.901855
host_var[164] 5.527466
host_var[165] 6.201782
host_var[166] 11.921631
host_var[167] 14.135376
host_var[168] 14.885864
host_var[169] 19.083618
host_var[170] 21.290283
host_var[171] 21.415649
host_var[172] 209.747559
host_var[173] 580.304932
host_var[174] 800.949951
host_var[175] 1119.857422
host_var[176] 1129.382324
host_var[177] 1032.616211
host_var[178] 972.797363
host_var[179] 915.440918
host_var[180] 905.890137
host_var[181] 943.649902
host_var[182] 1207.445801
host_var[183] 1345.912109
host_var[184] 1478.704590
host_var[185] 1224.895508
host_var[186] 1105.403564
host_var[187] 1031.981201
host_var[188] 914.456421
host_var[189] 835.127441
host_var[190] 1320.454102
host_var[191] 1561.439941
host_var[192] 1599.149902
host_var[193] 1912.232910
host_var[194] 1993.473145
host_var[195] 1913.377441
host_var[196] 1784.035645
host_var[197] 1554.712891
host_var[198] 1244.698242
host_var[199] 926.668945
End of Results
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
建议:
Suggestions: