I have some kernel, that runs ~150ms at max. But its small modification causes driver restart (I think cause it executes >2s in modified state but other reasons possible).
Modification involves biggest value finding among workgroup via local memory then writing back to global memory.
Could this cause SO big kernel slowdown that Windows watchdog timer alarms or I just did some error?
here is working version:
__kernel void GaussFit_SoG_kernel_cl(__global float* PoT, __global float* PoTPrefixSum,int ul_FftLength, __global GPUState* gpu_state,
__constant ocl_GaussFit_t* settings,__constant float* f_weight,
__global uint* result_flag, __global float4* GaussFitResults,
__local float* p,__global float* GaussFitScores
) {
int tid = get_global_id(0);//R: from 0 to fftlen
int ul_TOffset = get_global_id(1) + settings->GaussTOffsetStart;//R: something less than 64
for(int i=0;i<(GAUSS_POT_LENGTH/get_local_size(1));i++){
p[(get_local_size(1)*i+get_local_id(1))*get_local_size(0)+get_local_id(0)]=PoT[tid+(get_local_size(1)*i+get_local_id(1))*(ul_FftLength)];
}//R: fetch get_local_size(0) PoTs into local memory
__local float l_weight[GAUSS_POT_LENGTH];
int ltid=(get_local_size(0)*get_local_id(1)+get_local_id(0));
(ltid<GAUSS_POT_LENGTH)?(l_weight[ltid]=f_weight[ltid]):false;//R: fetch weights array into local memory too
__local float blscore[256];
float score=-12.f;
float bscore=gpu_state->gaussians.bscore;
bool was_reportable=(gpu_state->gaussians.index>0);
barrier(CLK_LOCAL_MEM_FENCE);
//R: now all accesses to PoT will go into local memory.
if(ul_TOffset<settings->GaussTOffsetStop && tid){ //return;//R: we outside of possible gaussian
float f_null_hyp;
int iSigma = settings->iSigma;
float f_TrueMean,
f_ChiSq,
f_PeakPower;
// slide dynamic gaussian across the Power Of Time array
// TrueMean is the mean power of the data set minus all power
// out to 2 sigma from our current TOffset.
f_TrueMean = GetTrueMean2_fl(
PoTPrefixSum,
ul_TOffset,
2 * iSigma,
ul_FftLength,tid,GAUSS_POT_LENGTH
);
f_PeakPower = GetPeak_local(
p,
ul_TOffset,
iSigma,
f_TrueMean,
settings->PeakScaleFactor,
ul_FftLength,l_weight
);
int out=ul_TOffset * ul_FftLength + tid;
#if 1
// worth looking at ?
int res=(f_PeakPower < settings->GaussPeakPowerThresh3*f_TrueMean);
//res&=1;
//debug[ul_TOffset * ul_FftLength + tid]=res;
if (res) {
GaussFitResults[out] = (float4)0.0f;
return;
}
#endif
// look at it - try to fit
f_ChiSq = GetChiSq_local(
p,
ul_FftLength,
ul_TOffset,
f_PeakPower,
f_TrueMean,
&f_null_hyp,l_weight,GAUSS_POT_LENGTH,settings->NumDataPoints
);
float4 tmp;
//R: scores calculation needed only until first reportable Gaussian will be found. Use this fact.
score=(was_reportable?-12.0f:
calc_GaussFit_score_fl(f_ChiSq, f_null_hyp,settings->score_offset,GAUSS_POT_LENGTH));
if ( ((f_ChiSq <= settings->GaussChiSqThresh) && (f_null_hyp >= settings->gauss_null_chi_sq_thresh)) ||
( (score > bscore) && (f_ChiSq <= settings->gauss_chi_sq_thresh) )
) {
int result_coordinate=(get_global_size(0)>RESULT_SIZE)?
((RESULT_SIZE*get_global_id(0))/get_global_size(0)):get_global_id(0);
result_flag[result_coordinate]=1;// hot attention required to this block
tmp.x=f_TrueMean;
tmp.y=f_PeakPower;
tmp.z=f_ChiSq;
tmp.w=f_null_hyp;
} else {
tmp=(float4)0.0f;
}
GaussFitResults[out] = tmp;
(!was_reportable)?GaussFitScores[out]=score:false;
}
if(!was_reportable){//R: Only if no reported Gaussians so far
/*
blscore[ltid]=score;
for(int i=(get_local_size(0)*get_local_size(1)>>1); i>0;i>>=1){
barrier(CLK_LOCAL_MEM_FENCE);
if(ltid<i){
blscore[ltid]=max(blscore[ltid],blscore[ltid+i]);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
*/
// bscore=(score>bscore?score:bscore);
// barrier(CLK_LOCAL_MEM_FENCE);
(ltid==0 && score>bscore)?gpu_state->gaussians.bscore=score:false;
//(ltid==0 && blscore[0]>bscore)?gpu_state->gaussians.bscore=blscore[0]:false;
}
} // End of gaussfit()
if(!was_reportable){//R: Only if no reported Gaussians so far
blscore[ltid]=score;
for(int i=(get_local_size(0)*get_local_size(1)>>1); i>0;i>>=1){
barrier(CLK_LOCAL_MEM_FENCE);
if(ltid<i){
blscore[ltid]=max(blscore[ltid],blscore[ltid+i]);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
(ltid==0 && blscore[0]>bscore)?gpu_state->gaussians.bscore=blscore[0]:false;
}
Of course I would prefer real max over WG finding instead of some random max from one of workitems but driver restart...