Raistmer

Driver restarts on HD6970

Discussion created by Raistmer on Feb 17, 2011
Latest reply on Feb 22, 2011 by Raistmer
works ok on HD5xxx GPUs

My app experiences driver restarts on HD6970 GPUs time to time.

Looks like one of kernels sometimes takes too long to execute:


PC_find_triplets_avg_kernel_cl__k11_Cayman1 15809 { 256 17 1} NULL 0,19282 0 14 0 3
ReadBuffer 15810 0,25789 0,02
set_mem_kernel_cl__k4_Cayman1 15811 { 1 1 1} NULL 0,00339 0 3 0 1
PC_find_pulse_local_kernel_cl__k21_Cayman1 15812 { 1024 17 1} { 64 1 1} 20628,77946 10496 62 0 7

Such huge time causes driver restart.
Usually this kernel take much lower amount of time:


PC_find_triplets_avg_kernel_cl__k11_Cayman1 155 { 256 17 1} NULL 0,19387 0 14 0 3
ReadBuffer 156 25,51814 0,02
set_mem_kernel_cl__k4_Cayman1 157 { 1 1 1} NULL 0,00349 0 3 0 1
PC_find_pulse_local_kernel_cl__k21_Cayman1 158 { 1024 17 1} { 64 1 1} 10,72624 10496 62 0 7
ReadBuffer 159 0,31345 0,02

This doesn't happen on HD5xxx GPUs, only on HD6xxx ones.

Is it driver problem? What is wrong in this particular kernel for HD6xxx GPUs ?

//R: this kernel should be used for small power arrays that fit in local memory comletely __kernel void PC_find_pulse_local_kernel_cl(float best_pulse_score, int PulsePotLen, int AdvanceBy, int fft_len, int ndivs, __global PulseFind_t* settings,__global float* t_funct_cache_fp, __global float* PoT,__local float* tmp_PoT,__global uint4* result_flag, __global float* PulsePoT_average) { //if(result_flag[0].x == 1) return;//R: if CPU processing already needed - abort kernel ASAP const int PoTLen = 1024*1024/fft_len; int ul_PoT = get_global_id(0); int y = get_global_id(1); int z=get_local_id(0); int TOffset1 = y * AdvanceBy; float rcfg_dis_thresh=settings->rcfg_dis_thresh; int PulseMax=settings->PulseMax; //if (ul_PoT < 1) return; // Original find_pulse, omits first element //if (ul_PoT >= fft_len) return; if(TOffset1 + PulsePotLen > PoTLen) { TOffset1 = PoTLen - PulsePotLen; } __global float* fp_PulsePot = PoT + ul_PoT + TOffset1 * (fft_len); __local float* tmp_pot = tmp_PoT +z; //float* best_pot = cudaAcc_PulseFind_settings.best_pot_fp + ul_PoT + TOffset2 * fft_len; //float* report_pot = cudaAcc_PulseFind_settings.report_pot_fp + ul_PoT + TOffset2 * fft_len; int di; float avg;//,maxp=0;//, snr=0, fthresh=0; float tmp_max; avg=PulsePoT_average[ul_PoT+y*fft_len]; // Periods from PulsePotLen/3 to PulsePotLen/4, and power of 2 fractions of. // then (len/4 to len/5) and finally (len/5 to len/6) // //int num_adds = pass; for(int num_adds = 3; num_adds <= 5; num_adds++) { int firstP, lastP; switch(num_adds) { case 3: lastP = (PulsePotLen*2)/3; firstP = (PulsePotLen*1)/2; break; case 4: lastP = (PulsePotLen*3)/4; firstP = (PulsePotLen*3)/5; break; case 5: lastP = (PulsePotLen*4)/5; firstP = (PulsePotLen*4)/6; break; } int num_adds_minus1 = num_adds - 1; int p = lastP; for (p = lastP ; p > firstP ; p--) { float cur_thresh, dis_thresh; int /*tabofst, */mper, perdiv; int tmp0, tmp1, tmp2, tmp3; //tabofst = ndivs*3+2-num_adds; mper = p * (12/num_adds_minus1); perdiv = num_adds_minus1; tmp0 = (int)((mper + 6)/12); // round(period) tmp1 = (int)((mper * 2 + 6)/12); // round(period*2) di = (int)p/perdiv; // (int)period //dis_thresh = cudaAcc_t_funct(di, num_adds)*avg; dis_thresh = t_funct_f(di, num_adds, 0, PulseMax, t_funct_cache_fp) * avg; switch(num_adds) { case 3: tmp_max = sumtop3_local(fp_PulsePot, tmp_pot, di, tmp0, tmp1,fft_len); break; case 4: tmp2 = (int)((mper * 3 + 6)/12); // round(period*3) tmp_max = sumtop4_local(fp_PulsePot, tmp_pot, di, tmp0, tmp1, tmp2,fft_len); break; case 5: tmp2 = (int)((mper * 3 + 6)/12); // round(period*3) tmp3 = (int)((mper * 4 + 6)/12); // round(period*4) tmp_max = sumtop5_local(fp_PulsePot, tmp_pot, di, tmp0, tmp1, tmp2, tmp3,fft_len); break; } if (tmp_max>dis_thresh) { // unscale for reporting tmp_max /= num_adds; cur_thresh = (dis_thresh / num_adds - avg) * rcfg_dis_thresh + avg; float _snr = (tmp_max-avg)/(cur_thresh-avg); if (_snr > best_pulse_score ) { result_flag[0].x = 1;return; } if( (tmp_max>cur_thresh) ) { result_flag[0].x = 1;return; } } int num_adds_2 = 2* num_adds; // int j = 1; //float tmp_max2=(float4)0.f; for (int j = 1; j < ndivs ; j++) { //perdiv *=2; tmp0 = di & 1; di >>= 1; tmp0 += di; dis_thresh = t_funct_f(di, num_adds, j,PulseMax,t_funct_cache_fp) * avg; { tmp_max = sumtop2_local(tmp_pot, tmp_pot, di,tmp0); } if (tmp_max>dis_thresh) { // unscale for reporting tmp_max /= num_adds_2; cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg; float _snr = (tmp_max-avg)/(cur_thresh-avg); if (_snr > best_pulse_score) { result_flag[0].x = 1;return; } if ((tmp_max>cur_thresh) ) { result_flag[0].x = 1;return; } } num_adds_2 *=2; } // for (j = 1; j < ndivs } // for (p = lastP } // for(num_adds = }

Outcomes