reducing scratch register usage

Discussion created by Meteorhead on Feb 28, 2011
Latest reply on Apr 4, 2011 by Meteorhead


My first question is: how are scratch registers allocated by the HW? The scratch register usage that SKA reports seems really high (58 GPR, 24 SR). Is the SR number shown the highest number of scratch registers allocated at any point during execution? The code below contains all the memory allocation in the kernel (plus the global constants, which in my mind should be compiled into the code and not take up registers). I do not think this much memory allocation justifies the use of 58 registers (that is, 58 four-component vectors of any 32-bit data type).


/* Program-scope constants: scalar and uint4 integer values. */
__constant uint stateMask = 1812433253u;
__constant uint thirty = 30u;
__constant uint4 mask4 = (uint4)(1812433253u);
__constant uint4 thirty4 = (uint4)(30u);
__constant uint4 one4 = (uint4)(1u);
__constant uint4 two4 = (uint4)(2u);
__constant uint4 three4 = (uint4)(3u);
__constant uint4 four4 = (uint4)(4u);
__constant uint4 zero4 = (uint4)(0u);

/* Program-scope constants: shift amounts and bit masks. */
__constant unsigned int thirteen = 13u;
__constant unsigned int fifteen = 15u;
__constant unsigned int shift = 8u * 3u;
__constant unsigned int mask11 = 0xfdff37ffu;
__constant unsigned int mask12 = 0xef7f3f7du;
__constant unsigned int mask13 = 0xff777b7du;
__constant unsigned int mask14 = 0x7ff7fb2fu;

/* Program-scope constants: floating point. intMax is 2^32 as a float. */
__constant float one = 1.0f;
__constant float intMax = 4294967296.0f;

/*
 * Kernel excerpt: only the private-variable declarations are shown; the
 * body performs no computation beyond the initializers below.
 *
 * Chainlength is the number of vectors.
 */
__kernel void main(
    __global uint4* seedArray,
    __global uint4* initial,
    const uint chainlength,
    __local uint4* share,
    __global uint4* partial,
    const uint timestep,
    const uint vectors_per_thread,
    const float4 P,
    const float4 Q,
    const float4 p,
    const float4 q,
    __global uint4* r_p,
    __global uint4* r_q,
    __local uint4* rp,
    __local uint4* rq,
    const uint resolution,
    __global int4* height,
    __global int4* T,
    __global float4* S,
    __global float* W)
{
    // Global init: work-item and work-group indices along dimension 0
    int gidX = get_global_id(0);
    int tidX = get_local_id(0);
    int lsiX = get_local_size(0);
    int GIDX = get_group_id(0);

    // KPZ init
    uint4 XOR = (uint4)(0);
    uint4 factor = (uint4)(0);
    uint4 particle;
    uint4 shifted;
    uint4 redp;
    uint4 redq;
    uint4 reducedp;
    uint4 reducedq;
    uint my_share;
    uint survive;

    // PRNG init
    // NOTE(review): temp[8] and random[16] below total 8 + 16 = 24 vector
    // elements, which matches the 24 scratch registers SKA reports.
    // Presumably these two private arrays are what gets placed in scratch;
    // confirm against the generated ISA.
    uint4 temp[8];

    // Each work-item loads its seed from index (group id * local size + local id).
    uint4 state1 = seedArray[GIDX * lsiX + tidX];
    uint4 state2 = (uint4)(0);
    uint4 state3 = (uint4)(0);
    uint4 state4 = (uint4)(0);
    uint4 state5 = (uint4)(0);

    uint4 r1 = (uint4)(0);
    uint4 r2 = (uint4)(0);

    uint4 a = (uint4)(0);
    uint4 b = (uint4)(0);
    uint4 e = (uint4)(0);
    uint4 f = (uint4)(0);

    float4 random[16];
} // main