Raistmer

SKA 1.5 can't show statistic for kernel

Discussion created by Raistmer on May 6, 2010
Latest reply on May 6, 2010 by bpurnomo
it shows assembly only

all columns have N/A value for all GPUs.
But this kernel works OK on my HD4870.

// Enter your kernel in this window __kernel void PC_single_pulse_kernel_FFA_update(__global float* gpu_power, __constant float* thresh, __global uint* results, const int num_dchunks, const int need_small_FFA_data,const int need_large_FFA_data, __global float* small_neg,__global float* small_pos, __global float* large_neg,__global float* large_pos) { uint tid=get_global_id(0); uint dchunk=get_global_id(1); uint len4 = 32768 >> 3; uint4 coadd_offsets=(uint4)(0); uint4 coadd_begins=(uint4)(0); uint next_begin,next_offset; float4 t=vload4(0,thresh);//thresholds for first 4 coadds uint result=0;//R: will contain info about location of found pulses and best signals uint4 was_pulse=(uint4)(0); float4 d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15; coadd_begins.x=0;//R:initial array - no need to write, in gpu_power domain coadd_begins.y=num_dchunks*32*(32768>>3); coadd_begins.z=coadd_begins.y+num_dchunks*(32768>>4)*32; coadd_begins.w=coadd_begins.z+num_dchunks*(32768>>5)*32; next_begin=coadd_begins.w+num_dchunks*(32768>>6)*32; coadd_offsets.x=coadd_begins.x+(dchunk*32+tid)*(32768>>3);//l==0 coadd_offsets.y=coadd_begins.y+(dchunk*32+tid)*(32768>>4); coadd_offsets.z=coadd_begins.z+(dchunk*32+tid)*(32768>>5); coadd_offsets.w=coadd_begins.w+(dchunk*32+tid)*(32768>>6); next_offset=next_begin+(dchunk*32+tid)*(32768>>7); for(int m=0;m<len4;m+=16){//R:loading data piece of 16*4 samples int m1=coadd_offsets.x+m; d0=vload4(m1,gpu_power);d1=vload4(m1+1,gpu_power);d2=vload4(m1+2,gpu_power);d3=vload4(m1+3,gpu_power); d4=vload4(m1+4,gpu_power);d5=vload4(m1+5,gpu_power);d6=vload4(m1+6,gpu_power);d7=vload4(m1+7,gpu_power); d8=vload4(m1+8,gpu_power);d9=vload4(m1+9,gpu_power);d10=vload4(m1+10,gpu_power);d11=vload4(m1+11,gpu_power); d12=vload4(m1+12,gpu_power);d13=vload4(m1+13,gpu_power);d14=vload4(m1+14,gpu_power);d15=vload4(m1+15,gpu_power); if ( (d0.x>t.x)||(d0.y>t.x)||(d0.z>t.x)||(d0.w>t.x)||(d1.x>t.x)||(d1.y>t.x)||(d1.z>t.x)||(d1.w>t.x)|| (d2.x>t.x)||(d2.y>t.x)||(d2.z>t.x)||(d2.w>t.x)||(d3.x>t.x)||(d3.y>t.x)||(d3.z>t.x)||(d3.w>t.x)|| (d4.x>t.x)||(d4.y>t.x)||(d4.z>t.x)||(d4.w>t.x)||(d5.x>t.x)||(d5.y>t.x)||(d5.z>t.x)||(d5.w>t.x)|| (d6.x>t.x)||(d6.y>t.x)||(d6.z>t.x)||(d6.w>t.x)||(d7.x>t.x)||(d7.y>t.x)||(d7.z>t.x)||(d7.w>t.x)|| (d8.x>t.x)||(d8.y>t.x)||(d8.z>t.x)||(d8.w>t.x)||(d9.x>t.x)||(d9.y>t.x)||(d9.z>t.x)||(d9.w>t.x)|| (d10.x>t.x)||(d10.y>t.x)||(d10.z>t.x)||(d10.w>t.x)||(d11.x>t.x)||(d11.y>t.x)||(d11.z>t.x)||(d11.w>t.x)|| (d12.x>t.x)||(d12.y>t.x)||(d12.z>t.x)||(d12.w>t.x)||(d13.x>t.x)||(d13.y>t.x)||(d13.z>t.x)||(d13.w>t.x)|| (d14.x>t.x)||(d14.y>t.x)||(d14.z>t.x)||(d14.w>t.x)||(d15.x>t.x)||(d15.y>t.x)||(d15.z>t.x)||(d15.w>t.x) ){ was_pulse.x=1; } //R: now do coadd and save coadded piece in memory d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; d4.xy=d8.xz+d8.yw;d4.zw=d9.xz+d9.yw;d5.xy=d10.xz+d10.yw;d5.zw=d11.xz+d11.yw; d6.xy=d12.xz+d12.yw;d6.zw=d13.xz+d13.yw;d7.xy=d14.xz+d14.yw;d7.zw=d15.xz+d15.yw; //R: next coadd level ready to check for pulses if ( (d0.x>t.y)||(d0.y>t.y)||(d0.z>t.y)||(d0.w>t.y)||(d1.x>t.y)||(d1.y>t.y)||(d1.z>t.y)||(d1.w>t.y)|| (d2.x>t.y)||(d2.y>t.y)||(d2.z>t.y)||(d2.w>t.y)||(d3.x>t.y)||(d3.y>t.y)||(d3.z>t.y)||(d3.w>t.y)|| (d4.x>t.y)||(d4.y>t.y)||(d4.z>t.y)||(d4.w>t.y)||(d5.x>t.y)||(d5.y>t.y)||(d5.z>t.y)||(d5.w>t.y)|| (d6.x>t.y)||(d6.y>t.y)||(d6.z>t.y)||(d6.w>t.y)||(d7.x>t.y)||(d7.y>t.y)||(d7.z>t.y)||(d7.w>t.y) ){ was_pulse.y=1; } //R: coadd level 2 starts d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; if ( (d0.x>t.z)||(d0.y>t.z)||(d0.z>t.z)||(d0.w>t.z)||(d1.x>t.z)||(d1.y>t.z)||(d1.z>t.z)||(d1.w>t.z)|| (d2.x>t.z)||(d2.y>t.z)||(d2.z>t.z)||(d2.w>t.z)||(d3.x>t.z)||(d3.y>t.z)||(d3.z>t.z)||(d3.w>t.z) ){ was_pulse.z=1; } //R: coadd level 3 starts d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; if ( (d0.x>t.w)||(d0.y>t.w)||(d0.z>t.w)||(d0.w>t.w)||(d1.x>t.w)||(d1.y>t.w)||(d1.z>t.w)||(d1.w>t.w) ){ was_pulse.w=1; } //R: now only coadd and store, need to refresh array in registers so memory read required d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw; m1=(m>>4)+next_offset; vstore4(d0,m1,gpu_power); //R: level 4 coadd saves data for small FFA sometimes if(need_small_FFA_data!=-1){//R: after some data chunk we will not store data for small FFA if(tid==0){ //R: Store negative DM sign power data for small FFA vstore4(d0,need_small_FFA_data+(m>>4)+dchunk*(32768>>7),small_neg); }else if(tid==1){//R: positive DM sign for small FFA vstore4(d0,need_small_FFA_data+(m>>4)+dchunk*(32768>>7),small_pos); } } }//R: initial and 3 coadded arrays are checked, now need to store results of check and reload for next levels result+=(was_pulse.x)+(was_pulse.y<<1)+(was_pulse.z<<2)+(was_pulse.w<<3); //R: now lets repeat for next few coadd levels and refill registers array from GPU memory t=vload4(1,thresh); was_pulse=(uint4)(0); len4 = 32768 >>(3+4);//R:4 coadds were done already coadd_begins.x=next_begin;//R: will fetch data stored at the end of prev coadd bunch loop coadd_begins.y=coadd_begins.x+num_dchunks*(32768>>7)*32; coadd_begins.z=coadd_begins.y+num_dchunks*(32768>>8)*32; coadd_begins.w=coadd_begins.z+num_dchunks*(32768>>9)*32; next_begin=coadd_begins.w+num_dchunks*(32768>>10)*32; coadd_offsets.x=coadd_begins.x+(dchunk*32+tid)*(32768>>7); coadd_offsets.y=coadd_begins.y+(dchunk*32+tid)*(32768>>8); coadd_offsets.z=coadd_begins.z+(dchunk*32+tid)*(32768>>9); coadd_offsets.w=coadd_begins.w+(dchunk*32+tid)*(32768>>10); next_offset=next_begin+(dchunk*32+tid)*(32768>>11); //R:now repeating same loop as for first for(int m=0;m<len4;m+=16){//R:loading data piece of 16*4 samples int m1=m+coadd_offsets.x; d0=vload4(m1,gpu_power);d1=vload4(m1+1,gpu_power);d2=vload4(m1+2,gpu_power);d3=vload4(m1+3,gpu_power); d4=vload4(m1+4,gpu_power);d5=vload4(m1+5,gpu_power);d6=vload4(m1+6,gpu_power);d7=vload4(m1+7,gpu_power); d8=vload4(m1+8,gpu_power);d9=vload4(m1+9,gpu_power);d10=vload4(m1+10,gpu_power);d11=vload4(m1+11,gpu_power); d12=vload4(m1+12,gpu_power);d13=vload4(m1+13,gpu_power);d14=vload4(m1+14,gpu_power);d15=vload4(m1+15,gpu_power); //R: coadd level 4 check if ( (d0.x>t.x)||(d0.y>t.x)||(d0.z>t.x)||(d0.w>t.x)||(d1.x>t.x)||(d1.y>t.x)||(d1.z>t.x)||(d1.w>t.x)|| (d2.x>t.x)||(d2.y>t.x)||(d2.z>t.x)||(d2.w>t.x)||(d3.x>t.x)||(d3.y>t.x)||(d3.z>t.x)||(d3.w>t.x)|| (d4.x>t.x)||(d4.y>t.x)||(d4.z>t.x)||(d4.w>t.x)||(d5.x>t.x)||(d5.y>t.x)||(d5.z>t.x)||(d5.w>t.x)|| (d6.x>t.x)||(d6.y>t.x)||(d6.z>t.x)||(d6.w>t.x)||(d7.x>t.x)||(d7.y>t.x)||(d7.z>t.x)||(d7.w>t.x)|| (d8.x>t.x)||(d8.y>t.x)||(d8.z>t.x)||(d8.w>t.x)||(d9.x>t.x)||(d9.y>t.x)||(d9.z>t.x)||(d9.w>t.x)|| (d10.x>t.x)||(d10.y>t.x)||(d10.z>t.x)||(d10.w>t.x)||(d11.x>t.x)||(d11.y>t.x)||(d11.z>t.x)||(d11.w>t.x)|| (d12.x>t.x)||(d12.y>t.x)||(d12.z>t.x)||(d12.w>t.x)||(d13.x>t.x)||(d13.y>t.x)||(d13.z>t.x)||(d13.w>t.x)|| (d14.x>t.x)||(d14.y>t.x)||(d14.z>t.x)||(d14.w>t.x)||(d15.x>t.x)||(d15.y>t.x)||(d15.z>t.x)||(d15.w>t.x) ){ was_pulse.x=1; } //R: now do coadd and save coadded piece in memory d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; d4.xy=d8.xz+d8.yw;d4.zw=d9.xz+d9.yw;d5.xy=d10.xz+d10.yw;d5.zw=d11.xz+d11.yw; d6.xy=d12.xz+d12.yw;d6.zw=d13.xz+d13.yw;d7.xy=d14.xz+d14.yw;d7.zw=d15.xz+d15.yw; //R: coadd level 5 check if ( (d0.x>t.y)||(d0.y>t.y)||(d0.z>t.y)||(d0.w>t.y)||(d1.x>t.y)||(d1.y>t.y)||(d1.z>t.y)||(d1.w>t.y)|| (d2.x>t.y)||(d2.y>t.y)||(d2.z>t.y)||(d2.w>t.y)||(d3.x>t.y)||(d3.y>t.y)||(d3.z>t.y)||(d3.w>t.y)|| (d4.x>t.y)||(d4.y>t.y)||(d4.z>t.y)||(d4.w>t.y)||(d5.x>t.y)||(d5.y>t.y)||(d5.z>t.y)||(d5.w>t.y)|| (d6.x>t.y)||(d6.y>t.y)||(d6.z>t.y)||(d6.w>t.y)||(d7.x>t.y)||(d7.y>t.y)||(d7.z>t.y)||(d7.w>t.y) ){ was_pulse.y=1; } //R: next coadd level starts d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; //R: coadd level 6 check if ( (d0.x>t.z)||(d0.y>t.z)||(d0.z>t.z)||(d0.w>t.z)||(d1.x>t.z)||(d1.y>t.z)||(d1.z>t.z)||(d1.w>t.z)|| (d2.x>t.z)||(d2.y>t.z)||(d2.z>t.z)||(d2.w>t.z)||(d3.x>t.z)||(d3.y>t.z)||(d3.z>t.z)||(d3.w>t.z) ){ was_pulse.z=1; } d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; if(need_large_FFA_data!=-1){//R: need to save data for large FFA only at beginning of large DM chunk if(tid==0){ //R: Store negative DM sign power data for large FFA vstore4(d0,need_large_FFA_data+(m>>3)+dchunk*(32768>>10),large_neg); vstore4(d1,need_large_FFA_data+(m>>3)+dchunk*(32768>>10)+1,large_neg); }else if(tid==1){//R: positive DM sign for large FFA vstore4(d0,need_large_FFA_data+(m>>3)+dchunk*(32768>>10),large_pos); vstore4(d1,need_large_FFA_data+(m>>3)+dchunk*(32768>>10)+1,large_pos); } } //R: coadd level 7 check if ( (d0.x>t.w)||(d0.y>t.w)||(d0.z>t.w)||(d0.w>t.w)||(d1.x>t.w)||(d1.y>t.w)||(d1.z>t.w)||(d1.w>t.w) ){ was_pulse.w=1; } //R: now only coadd and store, need to refresh array in registers so memory read required d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw; m1=(m>>4)+next_offset; vstore4(d0,m1,gpu_power);//R: coadd level 8 store } result+=(was_pulse.x<<4)+(was_pulse.y<<5)+(was_pulse.z<<6)+(was_pulse.w<<7); //R: now lets repeat for next few coadd levels and refill registers array from GPU memory t=vload4(2,thresh); was_pulse=(uint4)(0); coadd_begins.x=next_begin;//R: will fetch data stored at the end of prev coadd bunch loop coadd_begins.y=coadd_begins.x+num_dchunks*(32768>>11)*32; coadd_begins.z=coadd_begins.y+num_dchunks*(32768>>12)*32; coadd_begins.w=coadd_begins.z+num_dchunks*(32768>>13)*32; next_begin=coadd_begins.w+num_dchunks*(32768>>14)*32; coadd_offsets.x=coadd_begins.x+(dchunk*32+tid)*(32768>>11); coadd_offsets.y=coadd_begins.y+(dchunk*32+tid)*(32768>>12); coadd_offsets.z=coadd_begins.z+(dchunk*32+tid)*(32768>>13); coadd_offsets.w=coadd_begins.w+(dchunk*32+tid)*(32768>>14); next_offset=next_begin+(dchunk*32+tid)*(32768>>15); //R: 32768>>11==16 so we have only 16 float4 elements, no loop by m needed. int m1=coadd_offsets.x; d0=vload4(m1,gpu_power);d1=vload4(m1+1,gpu_power);d2=vload4(m1+2,gpu_power);d3=vload4(m1+3,gpu_power); d4=vload4(m1+4,gpu_power);d5=vload4(m1+5,gpu_power);d6=vload4(m1+6,gpu_power);d7=vload4(m1+7,gpu_power); d8=vload4(m1+8,gpu_power);d9=vload4(m1+9,gpu_power);d10=vload4(m1+10,gpu_power);d11=vload4(m1+11,gpu_power); d12=vload4(m1+12,gpu_power);d13=vload4(m1+13,gpu_power);d14=vload4(m1+14,gpu_power);d15=vload4(m1+15,gpu_power); //R: coadd level 8 check if ( (d0.x>t.x)||(d0.y>t.x)||(d0.z>t.x)||(d0.w>t.x)||(d1.x>t.x)||(d1.y>t.x)||(d1.z>t.x)||(d1.w>t.x)|| (d2.x>t.x)||(d2.y>t.x)||(d2.z>t.x)||(d2.w>t.x)||(d3.x>t.x)||(d3.y>t.x)||(d3.z>t.x)||(d3.w>t.x)|| (d4.x>t.x)||(d4.y>t.x)||(d4.z>t.x)||(d4.w>t.x)||(d5.x>t.x)||(d5.y>t.x)||(d5.z>t.x)||(d5.w>t.x)|| (d6.x>t.x)||(d6.y>t.x)||(d6.z>t.x)||(d6.w>t.x)||(d7.x>t.x)||(d7.y>t.x)||(d7.z>t.x)||(d7.w>t.x)|| (d8.x>t.x)||(d8.y>t.x)||(d8.z>t.x)||(d8.w>t.x)||(d9.x>t.x)||(d9.y>t.x)||(d9.z>t.x)||(d9.w>t.x)|| (d10.x>t.x)||(d10.y>t.x)||(d10.z>t.x)||(d10.w>t.x)||(d11.x>t.x)||(d11.y>t.x)||(d11.z>t.x)||(d11.w>t.x)|| (d12.x>t.x)||(d12.y>t.x)||(d12.z>t.x)||(d12.w>t.x)||(d13.x>t.x)||(d13.y>t.x)||(d13.z>t.x)||(d13.w>t.x)|| (d14.x>t.x)||(d14.y>t.x)||(d14.z>t.x)||(d14.w>t.x)||(d15.x>t.x)||(d15.y>t.x)||(d15.z>t.x)||(d15.w>t.x) ){ was_pulse.x=1; } //R: now do coadd and save coadded piece in memory d0.xy=d0.xz+d0.yw;d0.zw=d1.xz+d1.yw;d1.xy=d2.xz+d2.yw;d1.zw=d3.xz+d3.yw; d2.xy=d4.xz+d4.yw;d2.zw=d5.xz+d5.yw;d3.xy=d6.xz+d6.yw;d3.zw=d7.xz+d7.yw; d4.xy=d8.xz+d8.yw;d4.zw=d9.xz+d9.yw;d5.xy=d10.xz+d10.yw;d5.zw=d11.xz+d11.yw; d6.xy=d12.xz+d12.yw;d6.zw=d13.xz+d13.yw;d7.xy=d14.xz+d14.yw;d7.zw=d15.xz+d15.yw; //R: coadd level 9 check,last needed coadd if ( (d0.x>t.y)||(d0.y>t.y)||(d0.z>t.y)||(d0.w>t.y)||(d1.x>t.y)||(d1.y>t.y)||(d1.z>t.y)||(d1.w>t.y)|| (d2.x>t.y)||(d2.y>t.y)||(d2.z>t.y)||(d2.w>t.y)||(d3.x>t.y)||(d3.y>t.y)||(d3.z>t.y)||(d3.w>t.y)|| (d4.x>t.y)||(d4.y>t.y)||(d4.z>t.y)||(d4.w>t.y)||(d5.x>t.y)||(d5.y>t.y)||(d5.z>t.y)||(d5.w>t.y)|| (d6.x>t.y)||(d6.y>t.y)||(d6.z>t.y)||(d6.w>t.y)||(d7.x>t.y)||(d7.y>t.y)||(d7.z>t.y)||(d7.w>t.y) ){ was_pulse.y=1; } result+=(was_pulse.x<<8)+(was_pulse.y<<9); //R: last we do is to write result of pulse finding results[32*dchunk+tid]=result; }

Outcomes