Raistmer

Why such different GPR requirements?

Discussion created by Raistmer on Oct 25, 2009
Latest reply on Oct 29, 2009 by Jawed
2 kernels listed

The first of the attached kernels requires (according to SKA) only 7 registers, while the second one requires 18!

But there is no change in the number of directly declared registers.
Why does the second one require so many registers?

/*
 * Threshold-test 8 input values, then co-add adjacent pairs twice
 * (8 -> 4 -> 2 values), threshold-testing the result of each co-add.
 *
 * Inputs:
 *   src        - gather array; rows 0 and 1 at column threadID supply the
 *                8 input values (two float4).
 *   thresholds - gather array; rows level, level+1 and level+2 at column
 *                threadID supply the thresholds for the 8-, 4- and 2-value
 *                stages respectively.
 *   level      - first row of `thresholds` to use.
 *
 * Outputs:
 *   dest    - the 4 pair sums of the inputs.
 *   dest1   - .xy hold the 2 pair sums of `dest`; .zw are never assigned
 *             by this kernel.
 *   output  - x/y: hit count / bin for the 8-value stage,
 *             z/w: hit count / bin for the 4-value stage.
 *   output1 - x/y: hit count / bin for the 2-value stage,
 *             z/w: still the 4-value stage results written to `output`.
 *
 * In every stage the elements are tested from the highest index down, so
 * `bin` ends up as the lowest index that met the threshold. `bin` is not
 * reset between stages: when a stage has no hit (count == 0) the reported
 * bin is the value carried over from the previous stage.
 */
kernel void GPU_coadd_and_compare_kernel54t_s8(float4 src[][], int thresholds[][], int level,
                                               out float4 dest<>, out float4 dest1<>,
                                               out int4 output<>, out int4 output1<>)
{
    int threadID = instance().x;
    float threshold = (float)thresholds[level][threadID];
    int was_signal = 0;
    int bin = 0;
    int4 s;
    float4 i1, i2;
    float4 o11;
    float4 o21;

    i1 = src[0][threadID];
    i2 = src[1][threadID];

    /* 8-value stage: i1 holds elements 0..3, i2 holds elements 4..7. */
    if (i2.w >= threshold) {
        was_signal++;
        bin = 7;
    }
    if (i2.z >= threshold) {
        was_signal++;
        bin = 6;
    }
    if (i2.y >= threshold) {
        was_signal++;
        bin = 5;
    }
    if (i2.x >= threshold) {
        was_signal++;
        bin = 4;
    }
    if (i1.w >= threshold) {
        was_signal++;
        bin = 3;
    }
    if (i1.z >= threshold) {
        was_signal++;
        bin = 2;
    }
    if (i1.y >= threshold) {
        was_signal++;
        bin = 1;
    }
    if (i1.x >= threshold) {
        was_signal++;
        bin = 0;
    }
    s.x = was_signal;
    s.y = bin;

    /* First co-add: sum adjacent pairs, 8 values -> 4 values. */
    o11.xy = i1.xz + i1.yw;
    o11.zw = i2.xz + i2.yw;
    dest = o11;

    /* 4-value stage, tested against the level+1 threshold. */
    was_signal = 0;
    threshold = (float)thresholds[level + 1][threadID];
    if (o11.w >= threshold) {
        was_signal++;
        bin = 3;
    }
    if (o11.z >= threshold) {
        was_signal++;
        bin = 2;
    }
    if (o11.y >= threshold) {
        was_signal++;
        bin = 1;
    }
    if (o11.x >= threshold) {
        was_signal++;
        bin = 0;
    }
    /* The level+2 threshold is fetched here, before the 4-value results are stored. */
    threshold = (float)thresholds[level + 2][threadID];
    s.z = was_signal;
    s.w = bin;
    output = s;

    /* Second co-add: 4 values -> 2 values. Only o21.xy are written. */
    o21.xy = o11.xz + o11.yw;
    dest1 = o21;

    /* 2-value stage. */
    was_signal = 0;
    if (o21.y >= threshold) {
        was_signal++;
        bin = 1;
    }
    if (o21.x >= threshold) {
        was_signal++;
        bin = 0;
    }
    s.x = was_signal;
    s.y = bin;
    output1 = s;
}

/*
 * Same 8 -> 4 -> 2 co-add and threshold chain as
 * GPU_coadd_and_compare_kernel54t_s8, followed by a "stride add" that
 * folds the two input float4s together (i1 + i2) and repeats the
 * 4-value and 2-value stages on the folded data.
 *
 * Inputs:
 *   src        - gather array; rows 0 and 1 at column threadID supply the
 *                8 input values (two float4).
 *   thresholds - gather array; rows level, level+1 and level+2 at column
 *                threadID are loaded up front into threshold.x/.y/.z.
 *   level      - first row of `thresholds` to use.
 *
 * Outputs:
 *   dest    - the 4 pair sums of the inputs.
 *   dest1   - .xy hold the 2 pair sums of `dest`; .zw are never assigned
 *             by this kernel.
 *   d2      - the folded inputs, i1 + i2.
 *   d3      - .xy hold the 2 pair sums of `d2`; .zw still hold the
 *             i2 pair sums from the first co-add.
 *   output  - x/y: hit count / bin for the 8-value stage,
 *             z/w: hit count / bin for the 4-value stage.
 *   output1 - x/y: hit count / bin for the 2-value stage,
 *             z/w: hit count / bin for the folded 4-value stage.
 *   output2 - x/y: hit count / bin for the 2-value stage after the fold,
 *             z/w: still the folded 4-value stage results.
 *
 * As in the kernel above, `bin` is the lowest index that met the
 * threshold and is not reset between stages.
 */
kernel void GPU_strideadd_and_compare_kernel54t_s8(float4 src[][], int thresholds[][], int level,
                                                   out float4 dest<>/*s4*/,
                                                   out float4 dest1<>/*s2*/,
                                                   out float4 d2<>/*s4folded*/,
                                                   out float4 d3<>/*s2afters4fold*/,
                                                   out int4 output<>/*s8&s4*/,
                                                   out int4 output1<>/*s2&s4folded*/,
                                                   out int4 output2<>/*s2after fold*/)
{
    int threadID = instance().x;
    float4 threshold;
    int was_signal = 0;
    int bin = 0;
    int4 s;
    float4 i1, i2;
    float4 o11;
    float4 o21;

    /* threshold.x / .y / .z serve the 8-, 4- and 2-value stages; .w is unused. */
    threshold.x = (float)thresholds[level][threadID];
    threshold.y = (float)thresholds[level + 1][threadID];
    threshold.z = (float)thresholds[level + 2][threadID];

    i1 = src[0][threadID];
    i2 = src[1][threadID];

    /* 8-value stage: i1 holds elements 0..3, i2 holds elements 4..7. */
    if (i2.w >= threshold.x) {
        was_signal++;
        bin = 7;
    }
    if (i2.z >= threshold.x) {
        was_signal++;
        bin = 6;
    }
    if (i2.y >= threshold.x) {
        was_signal++;
        bin = 5;
    }
    if (i2.x >= threshold.x) {
        was_signal++;
        bin = 4;
    }
    if (i1.w >= threshold.x) {
        was_signal++;
        bin = 3;
    }
    if (i1.z >= threshold.x) {
        was_signal++;
        bin = 2;
    }
    if (i1.y >= threshold.x) {
        was_signal++;
        bin = 1;
    }
    if (i1.x >= threshold.x) {
        was_signal++;
        bin = 0;
    }
    s.x = was_signal;
    s.y = bin;

    /* First co-add: sum adjacent pairs, 8 values -> 4 values. */
    o11.xy = i1.xz + i1.yw;
    o11.zw = i2.xz + i2.yw;
    dest = o11;

    /* 4-value stage. */
    was_signal = 0;
    if (o11.w >= threshold.y) {
        was_signal++;
        bin = 3;
    }
    if (o11.z >= threshold.y) {
        was_signal++;
        bin = 2;
    }
    if (o11.y >= threshold.y) {
        was_signal++;
        bin = 1;
    }
    if (o11.x >= threshold.y) {
        was_signal++;
        bin = 0;
    }
    s.z = was_signal;
    s.w = bin;
    output = s;

    /* Second co-add: 4 values -> 2 values. Only o21.xy are written. */
    o21.xy = o11.xz + o11.yw;
    dest1 = o21;

    /* 2-value stage. */
    was_signal = 0;
    if (o21.y >= threshold.z) {
        was_signal++;
        bin = 1;
    }
    if (o21.x >= threshold.z) {
        was_signal++;
        bin = 0;
    }
    s.x = was_signal;
    s.y = bin;

    //S8 coadd finished, now do stride add and repeat coadd for S4
    was_signal = 0;
    i1.xyzw = i1.xyzw + i2.xyzw; //fold S8 to S4
    d2 = i1;
    if (i1.w >= threshold.y) {
        was_signal++;
        bin = 3;
    }
    if (i1.z >= threshold.y) {
        was_signal++;
        bin = 2;
    }
    if (i1.y >= threshold.y) {
        was_signal++;
        bin = 1;
    }
    if (i1.x >= threshold.y) {
        was_signal++;
        bin = 0;
    }
    s.z = was_signal;
    s.w = bin;
    output1 = s;

    //now coadd
    was_signal = 0;
    o11.xy = i1.xz + i1.yw; /* o11.zw keep the values from the first co-add */
    d3 = o11;
    if (o11.y >= threshold.z) {
        was_signal++;
        bin = 1;
    }
    if (o11.x >= threshold.z) {
        was_signal++;
        bin = 0;
    }
    s.x = was_signal;
    s.y = bin;
    output2 = s;
}

Outcomes