And if so, would a reduction kernel be faster?
It scans each 1D subarray of a big 2D array and counts the signals found (the values that are at or above the threshold value).
// Counts, for one row of a 2D float4 array, how many scalar components are
// at or above that row's threshold, and records the lowest flat index of
// such a component.
//
// Each kernel instance handles the row selected by instance().x.
//
//   size       - number of float4 elements scanned in the row
//                (presumably the row length in float4 units - TODO confirm
//                against the caller)
//   input      - 2D gather array, indexed as input[row][element]
//   thresholds - 2D gather array of integer thresholds, indexed as
//                thresholds[row][level]
//   level      - selects which threshold of the row is used
//   output     - output.x = number of components >= threshold
//                output.y = flat component index (4*element + lane, with
//                           lane x=0, y=1, z=2, w=3) of the lowest-indexed
//                           component >= threshold; stays 0 when nothing
//                           was found, so check output.x before trusting it
kernel void GPU_compare_with_threshold_kernel84t(int size, float4 input[][], int thresholds[][], int level, out int2 output<>)
{
    int threadID = instance().x;
    float threshold = (float)thresholds[threadID][level];
    float4 p;
    int i;
    int was_signal = 0;
    int bin = 0;
    //int ln=(len[threadID]+3)>>2;

    // Walk the row from the last element to the first and test the lanes in
    // w, z, y, x order, so the last assignment to `bin` is always the
    // smallest flat index that met the threshold.
    for (i = size - 1; i >= 0; i--) {
        // Fetch element i of this row. The original posted code read
        // input[threadID] without the [i] subscript, so the loop index was
        // never used for the fetch.
        p = input[threadID][i];

        if (p.w >= threshold) {
            was_signal++;
            bin = 4 * i + 3;
        }
        if (p.z >= threshold) {
            was_signal++;
            bin = 4 * i + 2;
        }
        if (p.y >= threshold) {
            was_signal++;
            bin = 4 * i + 1;
        }
        if (p.x >= threshold) {
            was_signal++;
            bin = 4 * i;
        }
    }

    output.x = was_signal;
    output.y = bin;
}