//R: this kernel should be used for small power arrays that fit in local memory comletely __kernel void PC_find_pulse_local_kernel_cl(float best_pulse_score, int PulsePotLen, int AdvanceBy, int fft_len, int ndivs, __global PulseFind_t* settings,__global float* t_funct_cache_fp, __global float* PoT,__local float* tmp_PoT,__global uint4* result_flag, __global float* PulsePoT_average) { //if(result_flag[0].x == 1) return;//R: if CPU processing already needed - abort kernel ASAP const int PoTLen = 1024*1024/fft_len; int ul_PoT = get_global_id(0); int y = get_global_id(1); int z=get_local_id(0); int TOffset1 = y * AdvanceBy; float rcfg_dis_thresh=settings->rcfg_dis_thresh; int PulseMax=settings->PulseMax; //if (ul_PoT < 1) return; // Original find_pulse, omits first element //if (ul_PoT >= fft_len) return; if(TOffset1 + PulsePotLen > PoTLen) { TOffset1 = PoTLen - PulsePotLen; } __global float* fp_PulsePot = PoT + ul_PoT + TOffset1 * (fft_len); __local float* tmp_pot = tmp_PoT +z; //float* best_pot = cudaAcc_PulseFind_settings.best_pot_fp + ul_PoT + TOffset2 * fft_len; //float* report_pot = cudaAcc_PulseFind_settings.report_pot_fp + ul_PoT + TOffset2 * fft_len; int di; float avg;//,maxp=0;//, snr=0, fthresh=0; float tmp_max; avg=PulsePoT_average[ul_PoT+y*fft_len]; // Periods from PulsePotLen/3 to PulsePotLen/4, and power of 2 fractions of. 
// then (len/4 to len/5) and finally (len/5 to len/6) // //int num_adds = pass; for(int num_adds = 3; num_adds <= 5; num_adds++) { int firstP, lastP; switch(num_adds) { case 3: lastP = (PulsePotLen*2)/3; firstP = (PulsePotLen*1)/2; break; case 4: lastP = (PulsePotLen*3)/4; firstP = (PulsePotLen*3)/5; break; case 5: lastP = (PulsePotLen*4)/5; firstP = (PulsePotLen*4)/6; break; } int num_adds_minus1 = num_adds - 1; int p = lastP; for (p = lastP ; p > firstP ; p--) { float cur_thresh, dis_thresh; int /*tabofst, */mper, perdiv; int tmp0, tmp1, tmp2, tmp3; //tabofst = ndivs*3+2-num_adds; mper = p * (12/num_adds_minus1); perdiv = num_adds_minus1; tmp0 = (int)((mper + 6)/12); // round(period) tmp1 = (int)((mper * 2 + 6)/12); // round(period*2) di = (int)p/perdiv; // (int)period //dis_thresh = cudaAcc_t_funct(di, num_adds)*avg; dis_thresh = t_funct_f(di, num_adds, 0, PulseMax, t_funct_cache_fp) * avg; switch(num_adds) { case 3: tmp_max = sumtop3_local(fp_PulsePot, tmp_pot, di, tmp0, tmp1,fft_len); break; case 4: tmp2 = (int)((mper * 3 + 6)/12); // round(period*3) tmp_max = sumtop4_local(fp_PulsePot, tmp_pot, di, tmp0, tmp1, tmp2,fft_len); break; case 5: tmp2 = (int)((mper * 3 + 6)/12); // round(period*3) tmp3 = (int)((mper * 4 + 6)/12); // round(period*4) tmp_max = sumtop5_local(fp_PulsePot, tmp_pot, di, tmp0, tmp1, tmp2, tmp3,fft_len); break; } if (tmp_max>dis_thresh) { // unscale for reporting tmp_max /= num_adds; cur_thresh = (dis_thresh / num_adds - avg) * rcfg_dis_thresh + avg; float _snr = (tmp_max-avg)/(cur_thresh-avg); if (_snr > best_pulse_score ) { result_flag[0].x = 1;return; } if( (tmp_max>cur_thresh) ) { result_flag[0].x = 1;return; } } int num_adds_2 = 2* num_adds; // int j = 1; //float tmp_max2=(float4)0.f; for (int j = 1; j < ndivs ; j++) { //perdiv *=2; tmp0 = di & 1; di >>= 1; tmp0 += di; dis_thresh = t_funct_f(di, num_adds, j,PulseMax,t_funct_cache_fp) * avg; { tmp_max = sumtop2_local(tmp_pot, tmp_pot, di,tmp0); } if (tmp_max>dis_thresh) { // unscale for 
reporting tmp_max /= num_adds_2; cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg; float _snr = (tmp_max-avg)/(cur_thresh-avg); if (_snr > best_pulse_score) { result_flag[0].x = 1;return; } if ((tmp_max>cur_thresh) ) { result_flag[0].x = 1;return; } } num_adds_2 *=2; } // for (j = 1; j < ndivs } // for (p = lastP } // for(num_adds = }
You have multiple data-dependent loops in your kernel and it is not surprising that the runtime may be variable. To debug this, first get a reproducible case, then use printf to debug the variables that control the loops. I would also suggest unrolling the outermost loop to speed up your kernel.
Another suggestion: change every if statement to use the select function.
Originally posted by: Raistmer I get decent performance from this kernel on HD5xxx GPUs. This behavior is HD6xxx specific; it seems you missed that fact. And the execution time changes from ~10 ms to 20 seconds (!) — I would not call this "not surprising". The data flow of the app can't explain such a huge difference.
Please send your runtime code and system information (OS, SDK version, driver version).
I think the reason for the timeout in this kernel has to be the data dependent loops in the code. Have you confirmed that it is indeed this kernel that is timing out? To do that, put a call to clFinish right after the enqueue and see if you get past it.
Originally posted by: aheirich
Have you confirmed that it is indeed this kernel that is timing out?
According to the link http://vr-zone.com/articles/amd-radeon-hd-6970-and-hd-6950-review/10474-12.html
The GPU 6970 can be overclocked to about 950 MHz core and 1450 MHz (5800 MHz effective) memory, which is a modest increase of 70 MHz to the core and 75 MHz to the memory.
Can you please provide the system configuration of both the old host (where the driver restarts) and the new host (where it doesn't restart)?