1 Reply Latest reply on May 6, 2010 8:47 PM by bpurnomo

    SKA 1.5 can't show statistic for kernel

    Raistmer
      it shows assembly only

      all columns have N/A value for all GPUs.
      But this kernel works OK on my HD4870.

      // Enter your kernel in this window __kernel void PC_single_pulse_kernel_FFA_update(__global float* gpu_power, __constant float* thresh, __global uint* results, const int num_dchunks, const int need_small_FFA_data,const int need_large_FFA_data, __global float* small_neg,__global float* small_pos, __global float* large_neg,__global float* large_pos) { uint tid=get_global_id(0); uint dchunk=get_global_id(1); uint len4 = 32768 >> 3; uint4 coadd_offsets=(uint4)(0); uint4 coadd_begins=(uint4)(0); uint next_begin,next_offset; float4 t=vload4(0,thresh);//thresholds for first 4 coadds uint result=0;//R: will contain info about location of found pulses and best signals uint4 was_pulse=(uint4)(0); float4 d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15; coadd_begins.x=0;//R:initial array - no need to write, in gpu_power domain coadd_begins.y=num_dchunks*32*(32768>>3); coadd_begins.z=coadd_begins.y+num_dchunks*(32768>>4)*32; coadd_begins.w=coadd_begins.z+num_dchunks*(32768>>5)*32; next_begin=coadd_begins.w+num_dchunks*(32768>>6)*32; coadd_offsets.x=coadd_begins.x+(dchunk*32+tid)*(32768>>3);//l==0 coadd_offsets.y=coadd_begins.y+(dchunk*32+tid)*(32768>>4); coadd_offsets.z=coadd_begins.z+(dchunk*32+tid)*(32768>>5); coadd_offsets.w=coadd_begins.w+(dchunk*32+tid)*(32768>>6); next_offset=next_begin+(dchunk*32+tid)*(32768>>7); for(int m=0;m<len4;m+=16){//R:loading data piece of 16*4 samples int m1=coadd_offsets.x+m; d0=vload4(m1,gpu_power);d1=vload4(m1+1,gpu_power);d2=vload4(m1+2,gpu_power);d3=vload4(m1+3,gpu_power); d4=vload4(m1+4,gpu_power);d5=vload4(m1+5,gpu_power);d6=vload4(m1+6,gpu_power);d7=vload4(m1+7,gpu_power); d8=vload4(m1+8,gpu_power);d9=vload4(m1+9,gpu_power);d10=vload4(m1+10,gpu_power);d11=vload4(m1+11,gpu_power); d12=vload4(m1+12,gpu_power);d13=vload4(m1+13,gpu_power);d14=vload4(m1+14,gpu_power);d15=vload4(m1+15,gpu_power); if ( (d0.x>t.x)||(d0.y>t.x)||(d0.z>t.x)||(d0.w>t.x)||(d1.x>t.x)||(d1.y>t.x)||(d1.z>t.x)||(d1.w>t.x)|| (d2.x>t.x)||(d2.y>t.x)||(d2.z>t.x)||(d2.w>t.x)||(d3.x>t.x)||(d3.y>t.x)||(d3.z>t.x)||(d3.w>t.x)|| (d4.x>t.x)||(d4.y>t.x)||(d4.z>t.x)||(d4.w>t.x)||(d5.x>t.x)||(d5.y>t.x)||(d5.z>t.x)||(d5.w>t.x)|| (d6.x>t.x)||(d6.y>t.x)||(d6.z>t.x)||(d6.w>t.x)||(d7.x>t.x)||(d7.y>t.x)||(d7.z>t.x)||(d7.w>t.x)|| (d8.x>t.x)||(d8.y>t.x)||(d8.z>t.x)||(d8.w>t.x)||(d9.x>t.x)||(d9.y>t.x)||(d9.z>t.x)||(d9.w>t.x)|| (d10.x>t.x)||(d10.y>t.x)||(d10.z>t.x)||(d10.w>t.x)||(d11.x>t.x)||(d11.y>t.x)||(d11.z>t.x)||(d11.w>t.x)|| (d12.x>t.x)||(d12.y>t.x)||(d12.z>t.x)||(d12.w>t.x)||(d13.x>t.x)||(d13.y>t.x)||(d13.z>t.x)||(d13.w>t.x)|| (d14.x>t.x)||(d14.y>t.x)||(d14.z>t.x)||(d14.w>t.x)||(d15.x>t.x)||(d15.y>t.x)||(d15.z>t.x)||(d15.w>t.x) ){ was_pulse.x=1; } //R: now do coadd and save coadded piece in memory d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; d4.xy=d8.xz+d8.yw;d4.zw=d9.xz+d9.yw;d5.xy=d10.xz+d10.yw;d5.zw=d11.xz+d11.yw; d6.xy=d12.xz+d12.yw;d6.zw=d13.xz+d13.yw;d7.xy=d14.xz+d14.yw;d7.zw=d15.xz+d15.yw; //R: next coadd level ready to check for pulses if ( (d0.x>t.y)||(d0.y>t.y)||(d0.z>t.y)||(d0.w>t.y)||(d1.x>t.y)||(d1.y>t.y)||(d1.z>t.y)||(d1.w>t.y)|| (d2.x>t.y)||(d2.y>t.y)||(d2.z>t.y)||(d2.w>t.y)||(d3.x>t.y)||(d3.y>t.y)||(d3.z>t.y)||(d3.w>t.y)|| (d4.x>t.y)||(d4.y>t.y)||(d4.z>t.y)||(d4.w>t.y)||(d5.x>t.y)||(d5.y>t.y)||(d5.z>t.y)||(d5.w>t.y)|| (d6.x>t.y)||(d6.y>t.y)||(d6.z>t.y)||(d6.w>t.y)||(d7.x>t.y)||(d7.y>t.y)||(d7.z>t.y)||(d7.w>t.y) ){ was_pulse.y=1; } //R: coadd level 2 starts d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; if ( (d0.x>t.z)||(d0.y>t.z)||(d0.z>t.z)||(d0.w>t.z)||(d1.x>t.z)||(d1.y>t.z)||(d1.z>t.z)||(d1.w>t.z)|| (d2.x>t.z)||(d2.y>t.z)||(d2.z>t.z)||(d2.w>t.z)||(d3.x>t.z)||(d3.y>t.z)||(d3.z>t.z)||(d3.w>t.z) ){ was_pulse.z=1; } //R: coadd level 3 starts d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; if ( (d0.x>t.w)||(d0.y>t.w)||(d0.z>t.w)||(d0.w>t.w)||(d1.x>t.w)||(d1.y>t.w)||(d1.z>t.w)||(d1.w>t.w) ){ was_pulse.w=1; } //R: now only coadd and store, need to refresh array in registers so memory read required d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw; m1=(m>>4)+next_offset; vstore4(d0,m1,gpu_power); //R: level 4 coadd saves data for small FFA sometimes if(need_small_FFA_data!=-1){//R: after some data chunk we will not store data for small FFA if(tid==0){ //R: Store negative DM sign power data for small FFA vstore4(d0,need_small_FFA_data+(m>>4)+dchunk*(32768>>7),small_neg); }else if(tid==1){//R: positive DM sign for small FFA vstore4(d0,need_small_FFA_data+(m>>4)+dchunk*(32768>>7),small_pos); } } }//R: initial and 3 coadded arrays are checked, now need to store results of check and reload for next levels result+=(was_pulse.x)+(was_pulse.y<<1)+(was_pulse.z<<2)+(was_pulse.w<<3); //R: now lets repeat for next few coadd levels and refill registers array from GPU memory t=vload4(1,thresh); was_pulse=(uint4)(0); len4 = 32768 >>(3+4);//R:4 coadds were done already coadd_begins.x=next_begin;//R: will fetch data stored at the end of prev coadd bunch loop coadd_begins.y=coadd_begins.x+num_dchunks*(32768>>7)*32; coadd_begins.z=coadd_begins.y+num_dchunks*(32768>>8)*32; coadd_begins.w=coadd_begins.z+num_dchunks*(32768>>9)*32; next_begin=coadd_begins.w+num_dchunks*(32768>>10)*32; coadd_offsets.x=coadd_begins.x+(dchunk*32+tid)*(32768>>7); coadd_offsets.y=coadd_begins.y+(dchunk*32+tid)*(32768>>8); coadd_offsets.z=coadd_begins.z+(dchunk*32+tid)*(32768>>9); coadd_offsets.w=coadd_begins.w+(dchunk*32+tid)*(32768>>10); next_offset=next_begin+(dchunk*32+tid)*(32768>>11); //R:now repeating same loop as for first for(int m=0;m<len4;m+=16){//R:loading data piece of 16*4 samples int m1=m+coadd_offsets.x; d0=vload4(m1,gpu_power);d1=vload4(m1+1,gpu_power);d2=vload4(m1+2,gpu_power);d3=vload4(m1+3,gpu_power); d4=vload4(m1+4,gpu_power);d5=vload4(m1+5,gpu_power);d6=vload4(m1+6,gpu_power);d7=vload4(m1+7,gpu_power); d8=vload4(m1+8,gpu_power);d9=vload4(m1+9,gpu_power);d10=vload4(m1+10,gpu_power);d11=vload4(m1+11,gpu_power); d12=vload4(m1+12,gpu_power);d13=vload4(m1+13,gpu_power);d14=vload4(m1+14,gpu_power);d15=vload4(m1+15,gpu_power); //R: coadd level 4 check if ( (d0.x>t.x)||(d0.y>t.x)||(d0.z>t.x)||(d0.w>t.x)||(d1.x>t.x)||(d1.y>t.x)||(d1.z>t.x)||(d1.w>t.x)|| (d2.x>t.x)||(d2.y>t.x)||(d2.z>t.x)||(d2.w>t.x)||(d3.x>t.x)||(d3.y>t.x)||(d3.z>t.x)||(d3.w>t.x)|| (d4.x>t.x)||(d4.y>t.x)||(d4.z>t.x)||(d4.w>t.x)||(d5.x>t.x)||(d5.y>t.x)||(d5.z>t.x)||(d5.w>t.x)|| (d6.x>t.x)||(d6.y>t.x)||(d6.z>t.x)||(d6.w>t.x)||(d7.x>t.x)||(d7.y>t.x)||(d7.z>t.x)||(d7.w>t.x)|| (d8.x>t.x)||(d8.y>t.x)||(d8.z>t.x)||(d8.w>t.x)||(d9.x>t.x)||(d9.y>t.x)||(d9.z>t.x)||(d9.w>t.x)|| (d10.x>t.x)||(d10.y>t.x)||(d10.z>t.x)||(d10.w>t.x)||(d11.x>t.x)||(d11.y>t.x)||(d11.z>t.x)||(d11.w>t.x)|| (d12.x>t.x)||(d12.y>t.x)||(d12.z>t.x)||(d12.w>t.x)||(d13.x>t.x)||(d13.y>t.x)||(d13.z>t.x)||(d13.w>t.x)|| (d14.x>t.x)||(d14.y>t.x)||(d14.z>t.x)||(d14.w>t.x)||(d15.x>t.x)||(d15.y>t.x)||(d15.z>t.x)||(d15.w>t.x) ){ was_pulse.x=1; } //R: now do coadd and save coadded piece in memory d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; d4.xy=d8.xz+d8.yw;d4.zw=d9.xz+d9.yw;d5.xy=d10.xz+d10.yw;d5.zw=d11.xz+d11.yw; d6.xy=d12.xz+d12.yw;d6.zw=d13.xz+d13.yw;d7.xy=d14.xz+d14.yw;d7.zw=d15.xz+d15.yw; //R: coadd level 5 check if ( (d0.x>t.y)||(d0.y>t.y)||(d0.z>t.y)||(d0.w>t.y)||(d1.x>t.y)||(d1.y>t.y)||(d1.z>t.y)||(d1.w>t.y)|| (d2.x>t.y)||(d2.y>t.y)||(d2.z>t.y)||(d2.w>t.y)||(d3.x>t.y)||(d3.y>t.y)||(d3.z>t.y)||(d3.w>t.y)|| (d4.x>t.y)||(d4.y>t.y)||(d4.z>t.y)||(d4.w>t.y)||(d5.x>t.y)||(d5.y>t.y)||(d5.z>t.y)||(d5.w>t.y)|| (d6.x>t.y)||(d6.y>t.y)||(d6.z>t.y)||(d6.w>t.y)||(d7.x>t.y)||(d7.y>t.y)||(d7.z>t.y)||(d7.w>t.y) ){ was_pulse.y=1; } //R: next coadd level starts d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; //R: coadd level 6 check if ( (d0.x>t.z)||(d0.y>t.z)||(d0.z>t.z)||(d0.w>t.z)||(d1.x>t.z)||(d1.y>t.z)||(d1.z>t.z)||(d1.w>t.z)|| (d2.x>t.z)||(d2.y>t.z)||(d2.z>t.z)||(d2.w>t.z)||(d3.x>t.z)||(d3.y>t.z)||(d3.z>t.z)||(d3.w>t.z) ){ was_pulse.z=1; } d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; if(need_large_FFA_data!=-1){//R: need to save data for large FFA only at beginning of large DM chunk if(tid==0){ //R: Store negative DM sign power data for large FFA vstore4(d0,need_large_FFA_data+(m>>3)+dchunk*(32768>>10),large_neg); vstore4(d1,need_large_FFA_data+(m>>3)+dchunk*(32768>>10)+1,large_neg); }else if(tid==1){//R: positive DM sign for large FFA vstore4(d0,need_large_FFA_data+(m>>3)+dchunk*(32768>>10),large_pos); vstore4(d1,need_large_FFA_data+(m>>3)+dchunk*(32768>>10)+1,large_pos); } } //R: coadd level 7 check if ( (d0.x>t.w)||(d0.y>t.w)||(d0.z>t.w)||(d0.w>t.w)||(d1.x>t.w)||(d1.y>t.w)||(d1.z>t.w)||(d1.w>t.w) ){ was_pulse.w=1; } //R: now only coadd and store, need to refresh array in registers so memory read required d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw; m1=(m>>4)+next_offset; vstore4(d0,m1,gpu_power);//R: coadd level 8 store } result+=(was_pulse.x<<4)+(was_pulse.y<<5)+(was_pulse.z<<6)+(was_pulse.w<<7); //R: now lets repeat for next few coadd levels and refill registers array from GPU memory t=vload4(2,thresh); was_pulse=(uint4)(0); coadd_begins.x=next_begin;//R: will fetch data stored at the end of prev coadd bunch loop coadd_begins.y=coadd_begins.x+num_dchunks*(32768>>11)*32; coadd_begins.z=coadd_begins.y+num_dchunks*(32768>>12)*32; coadd_begins.w=coadd_begins.z+num_dchunks*(32768>>13)*32; next_begin=coadd_begins.w+num_dchunks*(32768>>14)*32; coadd_offsets.x=coadd_begins.x+(dchunk*32+tid)*(32768>>11); coadd_offsets.y=coadd_begins.y+(dchunk*32+tid)*(32768>>12); coadd_offsets.z=coadd_begins.z+(dchunk*32+tid)*(32768>>13); coadd_offsets.w=coadd_begins.w+(dchunk*32+tid)*(32768>>14); next_offset=next_begin+(dchunk*32+tid)*(32768>>15); //R: 32768>>11==16 so we have only 16 float4 elements, no loop by m needed. int m1=coadd_offsets.x; d0=vload4(m1,gpu_power);d1=vload4(m1+1,gpu_power);d2=vload4(m1+2,gpu_power);d3=vload4(m1+3,gpu_power); d4=vload4(m1+4,gpu_power);d5=vload4(m1+5,gpu_power);d6=vload4(m1+6,gpu_power);d7=vload4(m1+7,gpu_power); d8=vload4(m1+8,gpu_power);d9=vload4(m1+9,gpu_power);d10=vload4(m1+10,gpu_power);d11=vload4(m1+11,gpu_power); d12=vload4(m1+12,gpu_power);d13=vload4(m1+13,gpu_power);d14=vload4(m1+14,gpu_power);d15=vload4(m1+15,gpu_power); //R: coadd level 8 check if ( (d0.x>t.x)||(d0.y>t.x)||(d0.z>t.x)||(d0.w>t.x)||(d1.x>t.x)||(d1.y>t.x)||(d1.z>t.x)||(d1.w>t.x)|| (d2.x>t.x)||(d2.y>t.x)||(d2.z>t.x)||(d2.w>t.x)||(d3.x>t.x)||(d3.y>t.x)||(d3.z>t.x)||(d3.w>t.x)|| (d4.x>t.x)||(d4.y>t.x)||(d4.z>t.x)||(d4.w>t.x)||(d5.x>t.x)||(d5.y>t.x)||(d5.z>t.x)||(d5.w>t.x)|| (d6.x>t.x)||(d6.y>t.x)||(d6.z>t.x)||(d6.w>t.x)||(d7.x>t.x)||(d7.y>t.x)||(d7.z>t.x)||(d7.w>t.x)|| (d8.x>t.x)||(d8.y>t.x)||(d8.z>t.x)||(d8.w>t.x)||(d9.x>t.x)||(d9.y>t.x)||(d9.z>t.x)||(d9.w>t.x)|| (d10.x>t.x)||(d10.y>t.x)||(d10.z>t.x)||(d10.w>t.x)||(d11.x>t.x)||(d11.y>t.x)||(d11.z>t.x)||(d11.w>t.x)|| (d12.x>t.x)||(d12.y>t.x)||(d12.z>t.x)||(d12.w>t.x)||(d13.x>t.x)||(d13.y>t.x)||(d13.z>t.x)||(d13.w>t.x)|| (d14.x>t.x)||(d14.y>t.x)||(d14.z>t.x)||(d14.w>t.x)||(d15.x>t.x)||(d15.y>t.x)||(d15.z>t.x)||(d15.w>t.x) ){ was_pulse.x=1; } //R: now do coadd and save coadded piece in memory d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; d4.xy=d8.xz+d8.yw;d4.zw=d9.xz+d9.yw;d5.xy=d10.xz+d10.yw;d5.zw=d11.xz+d11.yw; d6.xy=d12.xz+d12.yw;d6.zw=d13.xz+d13.yw;d7.xy=d14.xz+d14.yw;d7.zw=d15.xz+d15.yw; //R: coadd level 9 check,last needed coadd if ( (d0.x>t.y)||(d0.y>t.y)||(d0.z>t.y)||(d0.w>t.y)||(d1.x>t.y)||(d1.y>t.y)||(d1.z>t.y)||(d1.w>t.y)|| (d2.x>t.y)||(d2.y>t.y)||(d2.z>t.y)||(d2.w>t.y)||(d3.x>t.y)||(d3.y>t.y)||(d3.z>t.y)||(d3.w>t.y)|| (d4.x>t.y)||(d4.y>t.y)||(d4.z>t.y)||(d4.w>t.y)||(d5.x>t.y)||(d5.y>t.y)||(d5.z>t.y)||(d5.w>t.y)|| (d6.x>t.y)||(d6.y>t.y)||(d6.z>t.y)||(d6.w>t.y)||(d7.x>t.y)||(d7.y>t.y)||(d7.z>t.y)||(d7.w>t.y) ){ was_pulse.y=1; } result+=(was_pulse.x<<8)+(was_pulse.y<<9); //R: last we do is to write result of pulse finding results[32*dchunk+tid]=result; }