8 Replies Latest reply on May 1, 2011 9:37 PM by himanshu.gautam

    Warning: kernel has register spilling. Lower performance is expected

    Tristan23
      kernel has register spilling. Lower performance is expected

      After I updated to Catalyst 11.3 / SDK 2.4, I get the following warning from clBuildProgram for some of my kernels:

      "Warning: computeFFT32Kernel kernel has register spilling. Lower performance is expected."

      I did not get this message with the previous Catalyst/SDK.

      Please advise.

        • Warning: kernel has register spilling. Lower performance is expected
          himanshu.gautam

          Please post the device you were using.

          Does the performance actually degrade when you run it after this warning?

          You can try to analyze the kernel using AMD APP kernel analyzer or post the kernel here.

          Thanks

            • Warning: kernel has register spilling. Lower performance is expected
              Tristan23

              My device: E-350 Zacate APU

              But from what I know other devices are affected as well (58xx series).

              Here are the 3 kernels that produce this warning message:

               

              __kernel void PC_single_pulse_kernel_FFA_update_vectorised(__global float4* gpu_power, __constant float4* thresh, __global uint* results, const int num_dchunks, const int need_small_FFA_data,const int need_large_FFA_data, __global float4* small_neg,__global float4* small_pos, __global float4* large_neg,__global float4* large_pos) { //R: each work-item will handle separate data_chunk/dm/sign power array through all needed coadds //R: to avoid host to device data transfers (very costly for current implementation) //R: thresholds and best power values will be maintained separately for CPU and GPU //R: thresholds are constant, but best pulses will need update. //R: need_*_FFA_data are non-zero (if needed) offsets into gpu_power buffer //R: we store small FFA buffers (negative first, then positive) right after coadds buffer //R: (coadds buffer take same size as initial power array thought it actually slightly smaller, just to simplify //R: buffer size calculations for now) then larger FFA buffers go, first negative one, next positive, to match with //R: signs order for GPU dechirping. 
uint tid=get_global_id(0); uint dchunk=get_global_id(1); uint len4 = 32768 >> 3; uint4 coadd_offsets=(uint4)(0); uint4 coadd_begins=(uint4)(0); uint next_begin,next_offset; float4 t=thresh[0];//thresholds for first 4 coadds float4 tt0=(float4)(t.x);//R:will keep current threshld for vector operations float4 tt1=(float4)(t.y);//R:will keep current threshld for vector operations float4 tt2=(float4)(t.z);//R:will keep current threshld for vector operations float4 tt3=(float4)(t.w);//R:will keep current threshld for vector operations uint result=0;//R: will contain info about location of found pulses and best signals int4 pulse0=(int4)(0);//R: result of comparison int4 pulse1=(int4)(0);//R: result of comparison int4 pulse2=(int4)(0);//R: result of comparison int4 pulse3=(int4)(0);//R: result of comparison uint4 was_pulse=(uint4)(0); //R: low 16 bits for pulses above threshold, each coadd level coded as enabled bit in corresponding position //R: high 16 bits for best pulses, if were, again, enabled bit means found pulse //R: will do unroll for coadds to save reads from global memory float4 d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15; //R: no arrays in registers still (AMD still had to implement it) so store data in separate registers coadd_begins.x=0;//R:initial array - no need to write, in gpu_power domain coadd_begins.y=num_dchunks*32*(32768>>3); coadd_begins.z=coadd_begins.y+num_dchunks*(32768>>4)*32; coadd_begins.w=coadd_begins.z+num_dchunks*(32768>>5)*32; next_begin=coadd_begins.w+num_dchunks*(32768>>6)*32; coadd_offsets.x=coadd_begins.x+(dchunk*32+tid)*(32768>>3);//l==0 coadd_offsets.y=coadd_begins.y+(dchunk*32+tid)*(32768>>4); coadd_offsets.z=coadd_begins.z+(dchunk*32+tid)*(32768>>5); coadd_offsets.w=coadd_begins.w+(dchunk*32+tid)*(32768>>6); next_offset=next_begin+(dchunk*32+tid)*(32768>>7); for(int m=0;m<len4;m+=16){//R:loading data piece of 16*4 samples int m1=coadd_offsets.x+m; 
d0=gpu_power[m1];d1=gpu_power[m1+1];d2=gpu_power[m1+2];d3=gpu_power[m1+3]; d4=gpu_power[m1+4];d5=gpu_power[m1+5];d6=gpu_power[m1+6];d7=gpu_power[m1+7]; d8=gpu_power[m1+8];d9=gpu_power[m1+9];d10=gpu_power[m1+10];d11=gpu_power[m1+11]; d12=gpu_power[m1+12];d13=gpu_power[m1+13];d14=gpu_power[m1+14];d15=gpu_power[m1+15]; pulse0|=(d0>tt0)|(d1>tt0)|(d2>tt0)|(d3>tt0)|(d4>tt0)|(d5>tt0)|(d6>tt0)|(d7>tt0)| (d8>tt0)|(d9>tt0)|(d10>tt0)|(d11>tt0)|(d12>tt0)|(d13>tt0)|(d14>tt0)|(d15>tt0); //R: now do coadd and save coadded piece in memory d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; d4.xy=d8.xz+d8.yw;d4.zw=d9.xz+d9.yw;d5.xy=d10.xz+d10.yw;d5.zw=d11.xz+d11.yw; d6.xy=d12.xz+d12.yw;d6.zw=d13.xz+d13.yw;d7.xy=d14.xz+d14.yw;d7.zw=d15.xz+d15.yw; //R: next coadd level ready to check for pulses pulse1|=(d0>tt1)|(d1>tt1)|(d2>tt1)|(d3>tt1)|(d4>tt1)|(d5>tt1)|(d6>tt1)|(d7>tt1); //R: coadd level 2 starts d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; pulse2|=(d0>tt2)|(d1>tt2)|(d2>tt2)|(d3>tt2); //R: coadd level 3 starts d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; pulse3|=(d0>tt3)|(d1>tt3); //R: now only coadd and store, need to refresh array in registers so memory read required d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw; m1=(m>>4)+next_offset; gpu_power[m1]=d0; //R: level 4 coadd saves data for small FFA sometimes if(need_small_FFA_data!=-1){//R: after some data chunk we will not store data for small FFA if(tid==0){ //R: Store negative DM sign power data for small FFA small_neg[need_small_FFA_data+(m>>4)+dchunk*(32768>>7)]=d0; }else if(tid==1){//R: positive DM sign for small FFA small_pos[need_small_FFA_data+(m>>4)+dchunk*(32768>>7)]=d0; } } }//R: initial and 3 coadded arrays are checked, now need to store results of check and reload for next levels 
was_pulse.x=1&(pulse0.x|pulse0.y|pulse0.z|pulse0.w); was_pulse.y=1&(pulse1.x|pulse1.y|pulse1.z|pulse1.w); was_pulse.z=1&(pulse2.x|pulse2.y|pulse2.z|pulse2.w); was_pulse.w=1&(pulse3.x|pulse3.y|pulse3.z|pulse3.w); result+=(was_pulse.x)+(was_pulse.y<<1)+(was_pulse.z<<2)+(was_pulse.w<<3); //R: now lets repeat for next few coadd levels and refill registers array from GPU memory t=thresh[1]; tt0=(float4)(t.x); tt1=(float4)(t.y); tt2=(float4)(t.z); tt3=(float4)(t.w); was_pulse=(uint4)(0); pulse0=(int4)(0); pulse1=(int4)(0); pulse2=(int4)(0); pulse3=(int4)(0); len4 = 32768 >>(3+4);//R:4 coadds were done already coadd_begins.x=next_begin;//R: will fetch data stored at the end of prev coadd bunch loop coadd_begins.y=coadd_begins.x+num_dchunks*(32768>>7)*32; coadd_begins.z=coadd_begins.y+num_dchunks*(32768>>8)*32; coadd_begins.w=coadd_begins.z+num_dchunks*(32768>>9)*32; next_begin=coadd_begins.w+num_dchunks*(32768>>10)*32; coadd_offsets.x=coadd_begins.x+(dchunk*32+tid)*(32768>>7); coadd_offsets.y=coadd_begins.y+(dchunk*32+tid)*(32768>>8); coadd_offsets.z=coadd_begins.z+(dchunk*32+tid)*(32768>>9); coadd_offsets.w=coadd_begins.w+(dchunk*32+tid)*(32768>>10); next_offset=next_begin+(dchunk*32+tid)*(32768>>11); //R:now repeating same loop as for first for(int m=0;m<len4;m+=16){//R:loading data piece of 16*4 samples int m1=m+coadd_offsets.x; d0=gpu_power[m1];d1=gpu_power[m1+1];d2=gpu_power[m1+2];d3=gpu_power[m1+3]; d4=gpu_power[m1+4];d5=gpu_power[m1+5];d6=gpu_power[m1+6];d7=gpu_power[m1+7]; d8=gpu_power[m1+8];d9=gpu_power[m1+9];d10=gpu_power[m1+10];d11=gpu_power[m1+11]; d12=gpu_power[m1+12];d13=gpu_power[m1+13];d14=gpu_power[m1+14];d15=gpu_power[m1+15]; //R: coadd level 4 check pulse0|=(d0>tt0)|(d1>tt0)|(d2>tt0)|(d3>tt0)|(d4>tt0)|(d5>tt0)|(d6>tt0)|(d7>tt0)| (d8>tt0)|(d9>tt0)|(d10>tt0)|(d11>tt0)|(d12>tt0)|(d13>tt0)|(d14>tt0)|(d15>tt0); //R: now do coadd and save coadded piece in memory d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; 
d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; d4.xy=d8.xz+d8.yw;d4.zw=d9.xz+d9.yw;d5.xy=d10.xz+d10.yw;d5.zw=d11.xz+d11.yw; d6.xy=d12.xz+d12.yw;d6.zw=d13.xz+d13.yw;d7.xy=d14.xz+d14.yw;d7.zw=d15.xz+d15.yw; //R: coadd level 5 check pulse1|=(d0>tt1)|(d1>tt1)|(d2>tt1)|(d3>tt1)|(d4>tt1)|(d5>tt1)|(d6>tt1)|(d7>tt1); //R: next coadd level starts d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; //R: coadd level 6 check pulse2|=(d0>tt2)|(d1>tt2)|(d2>tt2)|(d3>tt2); d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; if(need_large_FFA_data!=-1){//R: need to save data for large FFA only at beginning of large DM chunk if(tid==0){ //R: Store negative DM sign power data for large FFA large_neg[need_large_FFA_data+(m>>3)+dchunk*(32768>>10)]=d0; large_neg[need_large_FFA_data+(m>>3)+dchunk*(32768>>10)+1]=d1; }else if(tid==1){//R: positive DM sign for large FFA large_pos[need_large_FFA_data+(m>>3)+dchunk*(32768>>10)]=d0; large_pos[need_large_FFA_data+(m>>3)+dchunk*(32768>>10)+1]=d1; } } //R: coadd level 7 check pulse3|=(d0>tt3)|(d1>tt3); //R: now only coadd and store, need to refresh array in registers so memory read required d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw; m1=(m>>4)+next_offset; gpu_power[m1]=d0;//R: coadd level 8 store } was_pulse.x=1&(pulse0.x|pulse0.y|pulse0.z|pulse0.w); was_pulse.y=1&(pulse1.x|pulse1.y|pulse1.z|pulse1.w); was_pulse.z=1&(pulse2.x|pulse2.y|pulse2.z|pulse2.w); was_pulse.w=1&(pulse3.x|pulse3.y|pulse3.z|pulse3.w); result+=(was_pulse.x<<4)+(was_pulse.y<<5)+(was_pulse.z<<6)+(was_pulse.w<<7); //R: now lets repeat for next few coadd levels and refill registers array from GPU memory t=thresh[2]; tt0=(float4)(t.x); tt1=(float4)(t.y); tt2=(float4)(t.z); tt3=(float4)(t.w); was_pulse=(uint4)(0); pulse0=(int4)(0); pulse1=(int4)(0); pulse2=(int4)(0); pulse3=(int4)(0); coadd_begins.x=next_begin;//R: will fetch data stored at the end 
of prev coadd bunch loop coadd_begins.y=coadd_begins.x+num_dchunks*(32768>>11)*32; coadd_begins.z=coadd_begins.y+num_dchunks*(32768>>12)*32; coadd_begins.w=coadd_begins.z+num_dchunks*(32768>>13)*32; next_begin=coadd_begins.w+num_dchunks*(32768>>14)*32; coadd_offsets.x=coadd_begins.x+(dchunk*32+tid)*(32768>>11); coadd_offsets.y=coadd_begins.y+(dchunk*32+tid)*(32768>>12); coadd_offsets.z=coadd_begins.z+(dchunk*32+tid)*(32768>>13); coadd_offsets.w=coadd_begins.w+(dchunk*32+tid)*(32768>>14); next_offset=next_begin+(dchunk*32+tid)*(32768>>15); //R: 32768>>11==16 so we have only 16 float4 elements, no loop by m needed. int m1=coadd_offsets.x; d0=gpu_power[m1];d1=gpu_power[m1+1];d2=gpu_power[m1+2];d3=gpu_power[m1+3]; d4=gpu_power[m1+4];d5=gpu_power[m1+5];d6=gpu_power[m1+6];d7=gpu_power[m1+7]; d8=gpu_power[m1+8];d9=gpu_power[m1+9];d10=gpu_power[m1+10];d11=gpu_power[m1+11]; d12=gpu_power[m1+12];d13=gpu_power[m1+13];d14=gpu_power[m1+14];d15=gpu_power[m1+15]; //R: coadd level 8 check pulse0|=(d0>tt0)|(d1>tt0)|(d2>tt0)|(d3>tt0)|(d4>tt0)|(d5>tt0)|(d6>tt0)|(d7>tt0)| (d8>tt0)|(d9>tt0)|(d10>tt0)|(d11>tt0)|(d12>tt0)|(d13>tt0)|(d14>tt0)|(d15>tt0); //R: now do coadd and save coadded piece in memory d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; d4.xy=d8.xz+d8.yw;d4.zw=d9.xz+d9.yw;d5.xy=d10.xz+d10.yw;d5.zw=d11.xz+d11.yw; d6.xy=d12.xz+d12.yw;d6.zw=d13.xz+d13.yw;d7.xy=d14.xz+d14.yw;d7.zw=d15.xz+d15.yw; //R: coadd level 9 check,last needed coadd pulse1|=(d0>tt1)|(d1>tt1)|(d2>tt1)|(d3>tt1)|(d4>tt1)|(d5>tt1)|(d6>tt1)|(d7>tt1); was_pulse.x=1&(pulse0.x|pulse0.y|pulse0.z|pulse0.w); was_pulse.y=1&(pulse1.x|pulse1.y|pulse1.z|pulse1.w); result+=(was_pulse.x<<8)+(was_pulse.y<<9); //R: last we do is to write result of pulse finding results[32*dchunk+tid]=result; } __kernel void PC_single_pulse_kernel_FFA_update_reduce0(__global float4* gpu_power, __constant float4* thresh, __global uint* results, 
const int num_dchunks, const int need_small_FFA_data,//const int need_large_FFA_data, __global float4* small_neg,__global float4* small_pos//, //__global float4* large_neg,__global float4* large_pos ) { //R: each work-item will handle separate data_chunk/dm/sign power array through all needed coadds //R: to avoid host to device data transfers (very costly for current implementation) //R: thresholds and best power values will be maintained separately for CPU and GPU //R: thresholds are constant, but best pulses will need update. //R: need_*_FFA_data are non-zero (if needed) offsets into gpu_power buffer //R: we store small FFA buffers (negative first, then positive) right after coadds buffer //R: (coadds buffer take same size as initial power array thought it actually slightly smaller, just to simplify //R: buffer size calculations for now) then larger FFA buffers go, first negative one, next positive, to match with //R: signs order for GPU dechirping. uint tid=get_global_id(0); uint dchunk=get_global_id(1); int mchunk=get_global_id(2); // uint len4 = 32768 >> 3; // uint4 coadd_offsets=(uint4)(0); // uint4 coadd_begins=(uint4)(0); uint next_begin,next_offset; float4 t=thresh[0];//thresholds for first 4 coadds float4 tt0=(float4)(t.x);//R:will keep current threshold for vector operations float4 tt1=(float4)(t.y);//R:will keep current threshold for vector operations float4 tt2=(float4)(t.z);//R:will keep current threshold for vector operations float4 tt3=(float4)(t.w);//R:will keep current threshold for vector operations //uint result=0; //int4 pulse0=(int4)(0);//R: result of comparison //int4 pulse1=(int4)(0);//R: result of comparison //int4 pulse2=(int4)(0);//R: result of comparison //int4 pulse3=(int4)(0);//R: result of comparison int4 was_pulse=(int4)(0);////R: simple yes/no flag: was single pulse found somewhere or not; //vector for vectorized ops float4 d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15; //R: no arrays in registers still (AMD still had to 
implement it) so store data in separate registers //R: Coadded values storage address arithmetics now greatly simplified //R: only 1 coadd level stored per kernel call so max 2 coadds need to be addressed next_begin=num_dchunks*32*(32768>>3);//R: area just after initial power array next_offset=next_begin+(dchunk*32+tid)*(32768>>7);//R: don't account for mchunk offset //coadd_begins.x=0;//R:initial array - no need to write, in gpu_power domain //coadd_begins.y=num_dchunks*32*(32768>>3); //coadd_begins.z=coadd_begins.y+num_dchunks*(32768>>4)*32; //coadd_begins.w=coadd_begins.z+num_dchunks*(32768>>5)*32; //next_begin=coadd_begins.w+num_dchunks*(32768>>6)*32; //coadd_offsets.x=coadd_begins.x+(dchunk*32+tid)*(32768>>3);//l==0 //coadd_offsets.y=coadd_begins.y+(dchunk*32+tid)*(32768>>4); //coadd_offsets.z=coadd_begins.z+(dchunk*32+tid)*(32768>>5); //coadd_offsets.w=coadd_begins.w+(dchunk*32+tid)*(32768>>6); //next_offset=next_begin+(dchunk*32+tid)*(32768>>7); //for(int m=0;m<len4;m+=16) {//R:loading data piece of 16*4 samples int m1=(dchunk*32+tid)*(32768>>3)+(mchunk<<4);//R:reading corresponding mchunk (16*4 samples here) from power array - coadd level 0 d0=gpu_power[m1];d1=gpu_power[m1+1];d2=gpu_power[m1+2];d3=gpu_power[m1+3]; d4=gpu_power[m1+4];d5=gpu_power[m1+5];d6=gpu_power[m1+6];d7=gpu_power[m1+7]; d8=gpu_power[m1+8];d9=gpu_power[m1+9];d10=gpu_power[m1+10];d11=gpu_power[m1+11]; d12=gpu_power[m1+12];d13=gpu_power[m1+13];d14=gpu_power[m1+14];d15=gpu_power[m1+15]; was_pulse|=(d0>tt0)|(d1>tt0)|(d2>tt0)|(d3>tt0)|(d4>tt0)|(d5>tt0)|(d6>tt0)|(d7>tt0)| (d8>tt0)|(d9>tt0)|(d10>tt0)|(d11>tt0)|(d12>tt0)|(d13>tt0)|(d14>tt0)|(d15>tt0); //R: now docoadd and save coadded piece in registers d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; d4.xy=d8.xz+d8.yw;d4.zw=d9.xz+d9.yw;d5.xy=d10.xz+d10.yw;d5.zw=d11.xz+d11.yw; 
d6.xy=d12.xz+d12.yw;d6.zw=d13.xz+d13.yw;d7.xy=d14.xz+d14.yw;d7.zw=d15.xz+d15.yw; //R: next coadd level ready to check for pulses was_pulse|=(d0>tt1)|(d1>tt1)|(d2>tt1)|(d3>tt1)|(d4>tt1)|(d5>tt1)|(d6>tt1)|(d7>tt1); //R: coadd level 2 starts d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; was_pulse|=(d0>tt2)|(d1>tt2)|(d2>tt2)|(d3>tt2); //R: coadd level 3 starts d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; was_pulse|=(d0>tt3)|(d1>tt3); //R: now only coadd and store, need to refresh array in registers so memory read required d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw; m1=mchunk+next_offset; gpu_power[m1]=d0; //R: level 4 coadd saves data for small FFA sometimes if(need_small_FFA_data!=-1){//R: after some data chunk we will not store data for small FFA if(tid==0){ //R: Store negative DM sign power data for small FFA small_neg[need_small_FFA_data+mchunk+dchunk*(32768>>7)]=d0; }else if(tid==1){//R: positive DM sign for small FFA small_pos[need_small_FFA_data+mchunk+dchunk*(32768>>7)]=d0; } } } was_pulse.x=was_pulse.x||was_pulse.y||was_pulse.z||was_pulse.w;//result reduction if(was_pulse.x) results[0]=1; } __kernel void PC_single_pulse_kernel_FFA_update_reduce1(__global float4* gpu_power, __constant float4* thresh, __global uint* results, const int num_dchunks, //const int need_small_FFA_data, const int need_large_FFA_data, //__global float4* small_neg,__global float4* small_pos//, __global float4* large_neg,__global float4* large_pos ) { uint tid=get_global_id(0); uint dchunk=get_global_id(1); uint mchunk=get_global_id(2); //R: now lets repeat for next few coadd levels and refill registers array from GPU memory float4 t=thresh[1]; float4 tt0=(float4)(t.x);//R:will keep current threshold for vector operations float4 tt1=(float4)(t.y);//R:will keep current threshold for vector operations float4 tt2=(float4)(t.z);//R:will keep current threshold for vector 
operations float4 tt3=(float4)(t.w);//R:will keep current threshold for vector operations t=thresh[2];//for last 2 levels int4 was_pulse=(int4)(0);////R: simple yes/no flag: was single pulse found somewhere or not; //vector for vectorized ops float4 d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15; int initial_begin=num_dchunks*32*(32768>>3); int initial_offset=initial_begin+(dchunk*32+tid)*(32768>>7);//R:we reading already coadded date here, for particular dchunk //int next_offset=initial_begin+num_dchunks*(32768>>7)*32+(dchunk*32+tid)*(32768>>11); //len4 = 32768 >>(3+4);//R:4 coadds were done already //coadd_begins.x=next_begin;//R: will fetch data stored at the end of prev coadd bunch loop //coadd_begins.y=coadd_begins.x+num_dchunks*(32768>>7)*32; //coadd_begins.z=coadd_begins.y+num_dchunks*(32768>>8)*32; //coadd_begins.w=coadd_begins.z+num_dchunks*(32768>>9)*32; //next_begin=coadd_begins.w+num_dchunks*(32768>>10)*32; //coadd_offsets.x=coadd_begins.x+(dchunk*32+tid)*(32768>>7); //coadd_offsets.y=coadd_begins.y+(dchunk*32+tid)*(32768>>8); //coadd_offsets.z=coadd_begins.z+(dchunk*32+tid)*(32768>>9); //coadd_offsets.w=coadd_begins.w+(dchunk*32+tid)*(32768>>10); //next_offset=next_begin+(dchunk*32+tid)*(32768>>11); //R:now repeating same loop as for first //for(int m=0;m<len4;m+=16) {//R:loading data piece of 16*4 samples int m1=(mchunk<<4)+initial_offset; d0=gpu_power[m1];d1=gpu_power[m1+1];d2=gpu_power[m1+2];d3=gpu_power[m1+3]; d4=gpu_power[m1+4];d5=gpu_power[m1+5];d6=gpu_power[m1+6];d7=gpu_power[m1+7]; d8=gpu_power[m1+8];d9=gpu_power[m1+9];d10=gpu_power[m1+10];d11=gpu_power[m1+11]; d12=gpu_power[m1+12];d13=gpu_power[m1+13];d14=gpu_power[m1+14];d15=gpu_power[m1+15]; //R: coadd level 4 check was_pulse|=(d0>tt0)|(d1>tt0)|(d2>tt0)|(d3>tt0)|(d4>tt0)|(d5>tt0)|(d6>tt0)|(d7>tt0)| (d8>tt0)|(d9>tt0)|(d10>tt0)|(d11>tt0)|(d12>tt0)|(d13>tt0)|(d14>tt0)|(d15>tt0); //R: now do coadd and save coadded piece in registers 
d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; d4.xy=d8.xz+d8.yw;d4.zw=d9.xz+d9.yw;d5.xy=d10.xz+d10.yw;d5.zw=d11.xz+d11.yw; d6.xy=d12.xz+d12.yw;d6.zw=d13.xz+d13.yw;d7.xy=d14.xz+d14.yw;d7.zw=d15.xz+d15.yw; //R: coadd level 5 check was_pulse|=(d0>tt1)|(d1>tt1)|(d2>tt1)|(d3>tt1)|(d4>tt1)|(d5>tt1)|(d6>tt1)|(d7>tt1); //R: next coadd level starts d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; //R: coadd level 6 check was_pulse|=(d0>tt2)|(d1>tt2)|(d2>tt2)|(d3>tt2); d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; if(need_large_FFA_data!=-1){//R: need to save data for large FFA only at beginning of large DM chunk if(tid==0){ //R: Store negative DM sign power data for large FFA large_neg[need_large_FFA_data+(mchunk<<1)+dchunk*(32768>>10)]=d0; large_neg[need_large_FFA_data+(mchunk<<1)+dchunk*(32768>>10)+1]=d1; }else if(tid==1){//R: positive DM sign for large FFA large_pos[need_large_FFA_data+(mchunk<<1)+dchunk*(32768>>10)]=d0; large_pos[need_large_FFA_data+(mchunk<<1)+dchunk*(32768>>10)+1]=d1; } } //R: coadd level 7 check was_pulse|=(d0>tt3)|(d1>tt3); was_pulse.x|=was_pulse.y|was_pulse.z|was_pulse.w;//result reduction d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw; //R: coadd level 8 check was_pulse.x|=(d0.x>t.x)|(d0.y>t.x)|(d0.z>t.x)|(d0.w>t.x); //R: now do coadd and save coadded piece in registers d0.xy=d0.xz+d0.yw; //R: coadd level 9 check,last needed coadd was_pulse.x|=(d0.x>t.y)|(d0.y>t.y); } if(was_pulse.x) results[0]=1; }

            • Warning: kernel has register spilling. Lower performance is expected
              MicahVillmow
              Tristan23,
              Nou is correct, the warning was enabled in SDK 2.4, but it does not mean that spilling did not occur before 2.4. If you find that we introduced a performance regression caused by the spilling in SDK 2.4, please let us know so we can fix it.