1 Reply Latest reply on Jul 29, 2015 8:16 AM by dipak

    Why driver restart here...?

    Raistmer

      Hello

       

      I have some kernel, that runs ~150ms at max. But its small  modification causes driver restart (I think cause it executes >2s in modified state but other reasons possible).

      Modification involves biggest value finding among workgroup via local memory then writing back to global memory.

      Could this cause SO big kernel slowdown that Windows watchdog timer alarms or I just did some error?

       

      here is working version:
      
      __kernel void GaussFit_SoG_kernel_cl(__global float* PoT, __global float* PoTPrefixSum,int ul_FftLength, __global GPUState* gpu_state,
        __constant ocl_GaussFit_t* settings,__constant float* f_weight,
        __global uint* result_flag, __global float4* GaussFitResults,
        __local float* p,__global float* GaussFitScores
        ) {
          int tid =  get_global_id(0);//R: from 0 to fftlen
        int ul_TOffset =  get_global_id(1) + settings->GaussTOffsetStart;//R: something less than 64
        for(int i=0;i<(GAUSS_POT_LENGTH/get_local_size(1));i++){
        p[(get_local_size(1)*i+get_local_id(1))*get_local_size(0)+get_local_id(0)]=PoT[tid+(get_local_size(1)*i+get_local_id(1))*(ul_FftLength)];
        }//R: fetch get_local_size(0) PoTs into local memory
        __local float l_weight[GAUSS_POT_LENGTH];
        int ltid=(get_local_size(0)*get_local_id(1)+get_local_id(0));
        (ltid<GAUSS_POT_LENGTH)?(l_weight[ltid]=f_weight[ltid]):false;//R: fetch weights array into local memory too
        __local float blscore[256];
        float score=-12.f;
        float bscore=gpu_state->gaussians.bscore;
        bool was_reportable=(gpu_state->gaussians.index>0);
        barrier(CLK_LOCAL_MEM_FENCE);
      //R: now all accesses to PoT will go into local memory.
      if(ul_TOffset<settings->GaussTOffsetStop && tid){ //return;//R: we outside of possible gaussian
        float f_null_hyp;
      
      
        int iSigma = settings->iSigma;
      
      
        float f_TrueMean,
        f_ChiSq,
        f_PeakPower;
      
      
        // slide dynamic gaussian across the Power Of Time array
      
      
        // TrueMean is the mean power of the data set minus all power
        // out to 2 sigma from our current TOffset.
        f_TrueMean = GetTrueMean2_fl(
        PoTPrefixSum,
        ul_TOffset,
        2 * iSigma,
        ul_FftLength,tid,GAUSS_POT_LENGTH
        );
      
      
        f_PeakPower = GetPeak_local(
        p,
        ul_TOffset,
        iSigma,
        f_TrueMean,
        settings->PeakScaleFactor,
        ul_FftLength,l_weight
        );
        int out=ul_TOffset * ul_FftLength + tid;
      #if 1
        // worth looking at ?
        int res=(f_PeakPower  < settings->GaussPeakPowerThresh3*f_TrueMean);
        //res&=1;
        //debug[ul_TOffset * ul_FftLength + tid]=res;
        if (res) {
        GaussFitResults[out] = (float4)0.0f;
        return;
        }
      #endif
        // look at it - try to fit
        f_ChiSq = GetChiSq_local(
        p,
        ul_FftLength,
        ul_TOffset,
        f_PeakPower,
        f_TrueMean,
        &f_null_hyp,l_weight,GAUSS_POT_LENGTH,settings->NumDataPoints
        );
        float4 tmp;
      
      
      //R: scores calculation needed only until first reportable Gaussian will be found. Use this fact.
        score=(was_reportable?-12.0f:
        calc_GaussFit_score_fl(f_ChiSq, f_null_hyp,settings->score_offset,GAUSS_POT_LENGTH));
        if ( ((f_ChiSq <=  settings->GaussChiSqThresh) && (f_null_hyp >= settings->gauss_null_chi_sq_thresh)) ||
        ( (score > bscore) && (f_ChiSq <=  settings->gauss_chi_sq_thresh) )
        ) {
        int result_coordinate=(get_global_size(0)>RESULT_SIZE)?
        ((RESULT_SIZE*get_global_id(0))/get_global_size(0)):get_global_id(0);
        result_flag[result_coordinate]=1;// hot attention required to this block
        tmp.x=f_TrueMean;
        tmp.y=f_PeakPower;
        tmp.z=f_ChiSq;
        tmp.w=f_null_hyp;
        } else {
        tmp=(float4)0.0f;
        }
        GaussFitResults[out] = tmp;
        (!was_reportable)?GaussFitScores[out]=score:false;
      }
        if(!was_reportable){//R: Only if no reported Gaussians so far
      /*
        blscore[ltid]=score;
        for(int i=(get_local_size(0)*get_local_size(1)>>1); i>0;i>>=1){
        barrier(CLK_LOCAL_MEM_FENCE);
        if(ltid<i){
        blscore[ltid]=max(blscore[ltid],blscore[ltid+i]);
        }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
      */
        // bscore=(score>bscore?score:bscore);
        // barrier(CLK_LOCAL_MEM_FENCE);
        (ltid==0 && score>bscore)?gpu_state->gaussians.bscore=score:false;
        //(ltid==0 && blscore[0]>bscore)?gpu_state->gaussians.bscore=blscore[0]:false;
        }
      
      
      } // End of gaussfit()
      

       

       

      And this modification in the end causes driver restart:

      if(!was_reportable){//R: Only if no reported Gaussians so far
        blscore[ltid]=score;
        for(int i=(get_local_size(0)*get_local_size(1)>>1); i>0;i>>=1){
        barrier(CLK_LOCAL_MEM_FENCE);
        if(ltid<i){
        blscore[ltid]=max(blscore[ltid],blscore[ltid+i]);
        }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        (ltid==0 && blscore[0]>bscore)?gpu_state->gaussians.bscore=blscore[0]:false;
        }
      

       

      Of course I would prefer real max over WG finding instead of some random max from one of workitems but driver restart...