1 Reply Latest reply on Nov 22, 2010 12:27 PM by bpurnomo

    Why does the profiler report zero LDS size?

    Raistmer
      The local memory is allocated inside the kernel (as a __local array), yet the profiler reports an LDS size of zero. Profiler output:

      GaussFit_kernel_cl_02E6A328 556 { 16384 64 1} { 64 4 1} 10,59707 0 17 0 2 16384,00 2875,36 159,92 1,00 0,00 0,00 98,85 17,98 53,55 153440,63 3,00 22,09 0,08 0,00 16383,00 0,00 100,00 0,00 0,00

      kernel:
      __kernel void GaussFit_kernel_cl(__global float* PoT, __global float* PoTPrefixSum,int ul_FftLength, float best_gauss_score,
      __constant ocl_GaussFit_t* settings,__constant float* f_weight,
      __global uint4* result_flag, __global float4* GaussFitResults,__global int* debug) {
      int tid = get_global_id(0);//R: from 0 to fftlen
      int ul_TOffset = get_global_id(1) + settings->GaussTOffsetStart;//R: something less than 64
      //R: should be launched with 64x4 geometry. 256 threads are supported by ATI HD5xxx
      __local float p[64*64];
      for(int i=0;i<16;i++){//R: 64/4=16; 4 came from 64x4=256 - max workgroup size for ATI HD5xxx GPUs
      p[(4*i+get_local_id(1))*64+get_local_id(0)]=PoT[tid+(4*i+get_local_id(1))*(ul_FftLength)];
      }//R: fetch 64 PoTs into local memory
      barrier(CLK_LOCAL_MEM_FENCE);
      ......