cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

Raistmer
Adept II

Why driver restart here...?

Hello

I have some kernel, that runs ~150ms at max. But its small  modification causes driver restart (I think cause it executes >2s in modified state but other reasons possible).

Modification involves biggest value finding among workgroup via local memory then writing back to global memory.

Could this cause SO big kernel slowdown that Windows watchdog timer alarms or I just did some error?

here is working version:

__kernel void GaussFit_SoG_kernel_cl(__global float* PoT, __global float* PoTPrefixSum,int ul_FftLength, __global GPUState* gpu_state,

  __constant ocl_GaussFit_t* settings,__constant float* f_weight,

  __global uint* result_flag, __global float4* GaussFitResults,

  __local float* p,__global float* GaussFitScores

  ) {

    int tid =  get_global_id(0);//R: from 0 to fftlen

  int ul_TOffset =  get_global_id(1) + settings->GaussTOffsetStart;//R: something less than 64

  for(int i=0;i<(GAUSS_POT_LENGTH/get_local_size(1));i++){

  p[(get_local_size(1)*i+get_local_id(1))*get_local_size(0)+get_local_id(0)]=PoT[tid+(get_local_size(1)*i+get_local_id(1))*(ul_FftLength)];

  }//R: fetch get_local_size(0) PoTs into local memory

  __local float l_weight[GAUSS_POT_LENGTH];

  int ltid=(get_local_size(0)*get_local_id(1)+get_local_id(0));

  (ltid<GAUSS_POT_LENGTH)?(l_weight[ltid]=f_weight[ltid]):false;//R: fetch weights array into local memory too

  __local float blscore[256];

  float score=-12.f;

  float bscore=gpu_state->gaussians.bscore;

  bool was_reportable=(gpu_state->gaussians.index>0);

  barrier(CLK_LOCAL_MEM_FENCE);

//R: now all accesses to PoT will go into local memory.

if(ul_TOffset<settings->GaussTOffsetStop && tid){ //return;//R: we outside of possible gaussian

  float f_null_hyp;

  int iSigma = settings->iSigma;

  float f_TrueMean,

  f_ChiSq,

  f_PeakPower;

  // slide dynamic gaussian across the Power Of Time array

  // TrueMean is the mean power of the data set minus all power

  // out to 2 sigma from our current TOffset.

  f_TrueMean = GetTrueMean2_fl(

  PoTPrefixSum,

  ul_TOffset,

  2 * iSigma,

  ul_FftLength,tid,GAUSS_POT_LENGTH

  );

  f_PeakPower = GetPeak_local(

  p,

  ul_TOffset,

  iSigma,

  f_TrueMean,

  settings->PeakScaleFactor,

  ul_FftLength,l_weight

  );

  int out=ul_TOffset * ul_FftLength + tid;

#if 1

  // worth looking at ?

  int res=(f_PeakPower  < settings->GaussPeakPowerThresh3*f_TrueMean);

  //res&=1;

  //debug[ul_TOffset * ul_FftLength + tid]=res;

  if (res) {

  GaussFitResults[out] = (float4)0.0f;

  return;

  }

#endif

  // look at it - try to fit

  f_ChiSq = GetChiSq_local(

  p,

  ul_FftLength,

  ul_TOffset,

  f_PeakPower,

  f_TrueMean,

  &f_null_hyp,l_weight,GAUSS_POT_LENGTH,settings->NumDataPoints

  );

  float4 tmp;

//R: scores calculation needed only until first reportable Gaussian will be found. Use this fact.

  score=(was_reportable?-12.0f:

  calc_GaussFit_score_fl(f_ChiSq, f_null_hyp,settings->score_offset,GAUSS_POT_LENGTH));

  if ( ((f_ChiSq <=  settings->GaussChiSqThresh) && (f_null_hyp >= settings->gauss_null_chi_sq_thresh)) ||

  ( (score > bscore) && (f_ChiSq <=  settings->gauss_chi_sq_thresh) )

  ) {

  int result_coordinate=(get_global_size(0)>RESULT_SIZE)?

  ((RESULT_SIZE*get_global_id(0))/get_global_size(0)):get_global_id(0);

  result_flag[result_coordinate]=1;// hot attention required to this block

  tmp.x=f_TrueMean;

  tmp.y=f_PeakPower;

  tmp.z=f_ChiSq;

  tmp.w=f_null_hyp;

  } else {

  tmp=(float4)0.0f;

  }

  GaussFitResults[out] = tmp;

  (!was_reportable)?GaussFitScores[out]=score:false;

}

  if(!was_reportable){//R: Only if no reported Gaussians so far

/*

  blscore[ltid]=score;

  for(int i=(get_local_size(0)*get_local_size(1)>>1); i>0;i>>=1){

  barrier(CLK_LOCAL_MEM_FENCE);

  if(ltid<i){

  blscore[ltid]=max(blscore[ltid],blscore[ltid+i]);

  }

  }

  barrier(CLK_LOCAL_MEM_FENCE);

*/

  // bscore=(score>bscore?score:bscore);

  // barrier(CLK_LOCAL_MEM_FENCE);

  (ltid==0 && score>bscore)?gpu_state->gaussians.bscore=score:false;

  //(ltid==0 && blscore[0]>bscore)?gpu_state->gaussians.bscore=blscore[0]:false;

  }

} // End of gaussfit()

And this modification in the end causes driver restart:

if(!was_reportable){//R: Only if no reported Gaussians so far

  blscore[ltid]=score;

  for(int i=(get_local_size(0)*get_local_size(1)>>1); i>0;i>>=1){

  barrier(CLK_LOCAL_MEM_FENCE);

  if(ltid<i){

  blscore[ltid]=max(blscore[ltid],blscore[ltid+i]);

  }

  }

  barrier(CLK_LOCAL_MEM_FENCE);

  (ltid==0 && blscore[0]>bscore)?gpu_state->gaussians.bscore=blscore[0]:false;

  }

Of course I would prefer real max over WG finding instead of some random max from one of workitems but driver restart...

0 Likes
1 Reply
dipak
Big Boss

Hi,

My apologies for this delayed reply.

Couple of questions:

1) Does the application stop restarting if you increase the TDR value (TDR = Display driver stopped responding and has recovered )?

2) Did you try to analyse the kernels using CodeXL? If yes, did you any see any difference in performance counters that might impact the execution time?

Regards,

0 Likes