I have tried out several things.

One that nou has suggested, to limit work-group size. By default the kernel would like to run in a group of 256, limiting this to 128. This way there were no SR, but the same register number was required (82), but this time no SR usage was needed. (58+24) -->> (82+0)

Second try was to place defines instead of __constant variables, however this seemed futile also, it seemed that memory needs to be allocated for every single compile time constant, none could be placed into code. Same register usage was imposed.

I read through th code, and I could only account for roughly 50-56 registers. This also includes variables coming as constant kernel arguments. Taking that some operations require additional memory to hold temporary values, I could imagine 10-12 more registers. However 82 is still 130% of the register usage I would have in mind in a kernel looking like this.

I am using Win7 with SDK 2.3, Catalyst 11.1 and Mobility 5870 as device. See attached code for particular test case.

**Edit**: Nou, unfortunately putting brackets will not help in my case, becuase all the __constant variables are used by the PRNG all throughout the program. Or to be precise, I need them to be global, because I wanted the rndm() function to be defined outside the main. Declaring all those variables inside the PRNG would make it possible to free them from time to time, but rndm() is called very often and allocating and freeing memory so many times would be inefficient. I am worried much more about that extra 30% of GPR usage that I cannot seem to be able to account for.

// Vector type shifting with carryover between elements void lshift(uint4 input, uint shift, uint4* output) { unsigned int invshift = 32u - shift; uint4 temp; temp.w = input.w << shift; temp.z = (input.z << shift) | (input.w >> invshift); temp.y = (input.y << shift) | (input.z >> invshift); temp.x = (input.x << shift) | (input.y >> invshift); *output = temp; } void rshift(uint4 input, uint shift, uint4* output) { unsigned int invshift = 32u - shift; uint4 temp; temp.x = input.x >> shift; temp.y = (input.y >> shift) | (input.x << invshift); temp.z = (input.z >> shift) | (input.y << invshift); temp.w = (input.w >> shift) | (input.z << invshift); *output = temp; } void lshift128(uint4 input, uint shift, uint4* output) { unsigned int invshift = 32u - shift; uint4 temp; temp.x = input.x << shift; temp.y = (input.y << shift) | (input.x >> invshift); temp.z = (input.z << shift) | (input.y >> invshift); temp.w = (input.w << shift) | (input.z >> invshift); *output = temp; } void rshift128(uint4 input, uint shift, uint4* output) { unsigned int invshift = 32u - shift; uint4 temp; temp.w = input.w >> shift; temp.z = (input.z >> shift) | (input.w << invshift); temp.y = (input.y >> shift) | (input.z << invshift); temp.x = (input.x >> shift) | (input.y << invshift); *output = temp; } __constant uint stateMask = 1812433253u; __constant uint thirty = 30u; __constant uint4 mask4 = (uint4)(1812433253u); __constant uint4 thirty4 = (uint4)(30u); __constant uint4 one4 = (uint4)(1u); __constant uint4 two4 = (uint4)(2u); __constant uint4 three4 = (uint4)(3u); __constant uint4 four4 = (uint4)(4u); __constant unsigned int thirteen = 13u; __constant unsigned int fifteen = 15u; __constant unsigned int shift = 8u * 3u; __constant unsigned int mask11 = 0xfdff37ffu; __constant unsigned int mask12 = 0xef7f3f7du; __constant unsigned int mask13 = 0xff777b7du; __constant unsigned int mask14 = 0x7ff7fb2fu; __constant float one = 1.0f; __constant float intMax = 4294967296.0f; void rand(uint4* r1, uint4* r2, uint4* a, uint4* b, uint4* e, uint4* f, uint4* state1, uint4* state2, uint4* state3, uint4* state4, uint4* state5, uint4* temp, float4* out) { uint i = 0; for (i = 0; i < 8u ; ++i) { switch(i) { case 0: *r1 = *state4; *r2 = *state5; *a = *state1; *b = *state3; break; case 1: *r1 = *r2; *r2 = temp[0]; *a = *state2; *b = *state4; break; case 2: *r1 = *r2; *r2 = temp[1]; *a = *state3; *b = *state5; break; case 3: *r1 = *r2; *r2 = temp[2]; *a = *state4; *b = *state1; break; case 4: *r1 = *r2; *r2 = temp[3]; *a = *state5; *b = *state2; break; case 5: *r1 = *r2; *r2 = temp[4]; *a = temp[0]; *b = temp[2]; break; case 6: *r1 = *r2; *r2 = temp[5]; *a = temp[1]; *b = temp[3]; break; case 7: *r1 = *r2; *r2 = temp[6]; *a = temp[2]; *b = temp[4]; break; default: break; } lshift128(*a, shift, e); rshift128(*r1, shift, f); temp[i].x = (*a).x ^ (*e).x ^ (((*b).x >> thirteen) & mask11) ^ (*f).x ^ ((*r2).x << fifteen); temp[i].y = (*a).y ^ (*e).y ^ (((*b).y >> thirteen) & mask12) ^ (*f).y ^ ((*r2).y << fifteen); temp[i].z = (*a).z ^ (*e).z ^ (((*b).z >> thirteen) & mask13) ^ (*f).z ^ ((*r2).z << fifteen); temp[i].w = (*a).w ^ (*e).w ^ (((*b).w >> thirteen) & mask14) ^ (*f).w ^ ((*r2).w << fifteen); out[i] = convert_float4(temp[i]) * one / intMax; } // Re-initialize states *state1 = temp[6]; *state2 = mask4 * (*state1 ^ (*state1 >> thirty4)) + one4; *state3 = mask4 * (*state2 ^ (*state2 >> thirty4)) + two4; *state4 = mask4 * (*state3 ^ (*state3 >> thirty4)) + three4; *state5 = mask4 * (*state4 ^ (*state4 >> thirty4)) + four4; for (i = 8; i < 16u ; ++i) { switch(i) { case 8: *r1 = *state4; *r2 = *state5; *a = *state1; *b = *state3; break; case 9: *r1 = *r2; *r2 = temp[0]; *a = *state2; *b = *state4; break; case 10: *r1 = *r2; *r2 = temp[1]; *a = *state3; *b = *state5; break; case 11: *r1 = *r2; *r2 = temp[2]; *a = *state4; *b = *state1; break; case 12: *r1 = *r2; *r2 = temp[3]; *a = *state5; *b = *state2; break; case 13: *r1 = *r2; *r2 = temp[4]; *a = temp[0]; *b = temp[2]; break; case 14: *r1 = *r2; *r2 = temp[5]; *a = temp[1]; *b = temp[3]; break; case 15: *r1 = *r2; *r2 = temp[6]; *a = temp[2]; *b = temp[4]; break; default: break; } lshift128(*a, shift, e); rshift128(*r1, shift, f); temp[i%8].x = (*a).x ^ (*e).x ^ (((*b).x >> thirteen) & mask11) ^ (*f).x ^ ((*r2).x << fifteen); temp[i%8].y = (*a).y ^ (*e).y ^ (((*b).y >> thirteen) & mask12) ^ (*f).y ^ ((*r2).y << fifteen); temp[i%8].z = (*a).z ^ (*e).z ^ (((*b).z >> thirteen) & mask13) ^ (*f).z ^ ((*r2).z << fifteen); temp[i%8].w = (*a).w ^ (*e).w ^ (((*b).w >> thirteen) & mask14) ^ (*f).w ^ ((*r2).w << fifteen); out[i] = convert_float4(temp[i%8]) * one / intMax; } // Re-initialize states *state1 = temp[6]; *state2 = mask4 * (*state1 ^ (*state1 >> thirty4)) + one4; *state3 = mask4 * (*state2 ^ (*state2 >> thirty4)) + two4; *state4 = mask4 * (*state3 ^ (*state3 >> thirty4)) + three4; *state5 = mask4 * (*state4 ^ (*state4 >> thirty4)) + four4; } // rand // Chainlength is the number of vectors __kernel void test( __global uint4* seedArray, __global uint4* initial, const uint chainlength, __local uint4* share, __global uint4* partial, const uint timestep, const uint vectors_per_thread, const float4 P, const float4 Q, const float4 p, const float4 q, __global uint4* r_p, __global uint4* r_q, __local uint4* rp, __local uint4* rq, const uint resolution, __global int4* height, __global int4* T, __global float4* S, __global float* W) { // Global init int gidX = get_global_id(0); int tidX = get_local_id(0); int lsiX = get_local_size(0); int GIDX = get_group_id(0); // KPZ init uint4 XOR = (uint4)(0); uint4 factor = (uint4)(0); uint4 zero4 = (uint4)(0); uint4 particle; uint4 shifted; uint4 redp; uint4 redq; uint4 reducedp; uint4 reducedq; uint my_share; uint survive; // PRNG init uint4 temp[8]; uint4 state1 = seedArray[GIDX * lsiX + tidX]; uint4 state2 = (uint4)(0); uint4 state3 = (uint4)(0); uint4 state4 = (uint4)(0); uint4 state5 = (uint4)(0); uint4 r1 = (uint4)(0); uint4 r2 = (uint4)(0); uint4 a = (uint4)(0); uint4 b = (uint4)(0); uint4 e = (uint4)(0); uint4 f = (uint4)(0); //Initializing states. state2 = mask4 * (state1 ^ (state1 >> thirty4)) + one4; state3 = mask4 * (state2 ^ (state2 >> thirty4)) + two4; state4 = mask4 * (state3 ^ (state3 >> thirty4)) + three4; state5 = mask4 * (state4 ^ (state4 >> thirty4)) + four4; float4 random[16]; barrier(CLK_GLOBAL_MEM_FENCE); // All threads wait for each other after init for (uint t = 0 ; t < timestep ; ++t) { for (uint v = 0 ; v < vectors_per_thread ; ++v) { my_share = tidX * vectors_per_thread + v; prefetch(&partial[GIDX * chainlength + my_share + 1], sizeof(uint4)); prefetch(&r_p[my_share + 1], sizeof(uint4)); prefetch(&r_p[my_share + 1], sizeof(uint4)); particle = partial[GIDX * chainlength + my_share]; redp = r_p[my_share]; redq = r_q[my_share]; factor = zero4; XOR = zero4; rand(&r1, &r2, &a, &b, &e, &f, &state1, &state2, &state3, &state4, &state5, temp, random); // Odd timestep for (uint i = 0 ; i < 16u ; ++i) { shifted = ((particle << (uint4)(i*2u)) >> thirty4 ); reducedp = (redp << (uint4)(i*2u)) >> (uint4)(31u); reducedq = (redq << (uint4)(i*2u+1)) >> (uint4)(31u); factor = select(zero4, select(zero4, three4, random[i] < select(P, p, reducedp == one4 )), shifted == two4 ); factor |= select(zero4, select(zero4, three4, random[i] < select(Q, q, reducedq == one4 )), shifted == one4 ); XOR = XOR << two4; XOR |= factor; } particle ^= XOR; partial[GIDX * chainlength + my_share] = particle; } barrier(CLK_GLOBAL_MEM_FENCE); for (uint v = 0 ; v < vectors_per_thread ; ++v) { // Even timestep my_share = tidX * vectors_per_thread + v; prefetch(&partial[GIDX * chainlength + my_share + 1], 2 * sizeof(uint4)); prefetch(&r_p[my_share + 1], 2 * sizeof(uint4)); prefetch(&r_p[my_share + 1], 2 * sizeof(uint4)); particle = partial[GIDX * chainlength + my_share]; survive = particle.x; // survive used to save leftmost bit that is shifted out. lshift(particle, 1u, &particle); factor = partial[GIDX * chainlength + ((my_share + 1) % chainlength)]; // factor used as temp particle.w |= factor.x >> 31u; redp = r_p[my_share]; lshift(redp, 1u, &redp); redp.w |= r_p[(my_share + 1) % chainlength].x >> 31u; redq = r_q[my_share]; lshift(redq, 1u, &redq); redq.w |= r_q[(my_share + 1) % chainlength].x >> 31u; mem_fence(CLK_GLOBAL_MEM_FENCE); factor = zero4; XOR = zero4; rand(&r1, &r2, &a, &b, &e, &f, &state1, &state2, &state3, &state4, &state5, temp, random); for (uint i = 0 ; i < 16 ; ++i) { shifted = ((particle << (uint4)(i*2u)) >> thirty4 ); reducedp = (redp << (uint4)(i*2u)) >> (uint4)(31u); reducedq = (redq << (uint4)(i*2u+1)) >> (uint4)(31u); factor = select(zero4, select(zero4, three4, random[i] < select(P, p, reducedp == one4 )), shifted == two4 ); factor |= select(zero4, select(zero4, three4, random[i] < select(Q, q, reducedq == one4 )), shifted == one4 ); XOR = XOR << two4; XOR |= factor; } particle ^= XOR; factor = particle; // factor being used as temp to save rightmost bit rshift(particle, 1u, &particle); particle.x |= (survive >> 31u) << 31u; // keep first bit as we found it, survive used barrier(CLK_GLOBAL_MEM_FENCE); partial[GIDX * chainlength + my_share] = particle; barrier(CLK_GLOBAL_MEM_FENCE); survive = partial[GIDX * chainlength + ((my_share + 1) % chainlength)].x; // survive used as temp again for other purpose survive = ((survive << 1u) >> 1u) | (factor.w << 31u); // merge neighbours first vector component with bit to be written into it partial[GIDX * chainlength + ((my_share + 1) % chainlength)].x = survive; // make actual copy into neighbour barrier(CLK_GLOBAL_MEM_FENCE); } // vectors_per_thread barrier(CLK_GLOBAL_MEM_FENCE); } // timestep barrier(CLK_GLOBAL_MEM_FENCE); seedArray[GIDX * lsiX + tidX] = state1; // Save state of PRNG for next kernel call } // main