Archives Discussions

Meteorhead · ‎07-26-2010

Could some SDK developer verify, that the compiler is doing something very wrong in the kernel below.

I have mentioned earlier (in the "conditional operator behaviour on vector operand" topic but have not gotten an answer) that nested for cycles must have hardcoded lengths, otherwise the program won't compile. I have the same solution in this code, but that is not the problem.

For some strange reason, the comparison of random numbers to a preset constant give the same results all the time, although I have verified on host that random[0] is a nicely changing, good random vector all the time.

The comparison inside the loops is at the end of the code and the funcitonally same PRNG is at the beginning.

I have indicated in comments what line results correct behaviour and what line results in disastrous.

This is the first two integers of the output bitwise both right and wrong on different Multiprocessors:

Correct output:

Compute unit #0 produced:
00001100111111110000110011111111
11001100000000111100110000000011
Compute unit #1 produced:
11001111000011001100111100001100
00111100111111000011110011111100
Compute unit #2 produced:
11111111000011001111111100001100
00000000110011110000000011001111
Compute unit #3 produced:
11000011001100001100001100110000
11000000001111001100000000111100

Incorrect output:

Compute unit #0 produced:
00000000000000000000000000000000
11111111111111111111111111111111
Compute unit #1 produced:
11111111111111111111111111111111
00000000000000000000000000000000
Compute unit #2 produced:
11111111111111111111111111111111
00000000000000000000000000000000
Compute unit #3 produced:
11111111111111111111111111111111
11111111111111111111111111111111

Could someone verify that it's not my mistake it's working like this?

// Vector type shifting with carryover between elements void lshift128(uint4 input, uint shift, uint4* output) { unsigned int invshift = 32u - shift; uint4 temp; temp.x = input.x << shift; temp.y = (input.y << shift) | (input.x >> invshift); temp.z = (input.z << shift) | (input.y >> invshift); temp.w = (input.w << shift) | (input.z >> invshift); *output = temp; } void rshift128(uint4 input, uint shift, uint4* output) { unsigned int invshift = 32u - shift; uint4 temp; temp.w = input.w >> shift; temp.z = (input.z >> shift) | (input.w << invshift); temp.y = (input.y >> shift) | (input.z << invshift); temp.x = (input.x >> shift) | (input.y << invshift); *output = temp; } __constant unsigned int thirteen = 13u; __constant unsigned int fifteen = 15u; __constant unsigned int shift = 8u * 3u; __constant unsigned int mask11 = 0xfdff37ffu; __constant unsigned int mask12 = 0xef7f3f7du; __constant unsigned int mask13 = 0xff777b7du; __constant unsigned int mask14 = 0x7ff7fb2fu; __constant float one = 1.0f; __constant float intMax = 4294967296.0f; inline void rand(uint4* r1, uint4* r2, uint4* a, uint4* b, uint4* e, uint4* f, uint4* state1, uint4* state2, uint4* state3, uint4* state4, uint4* state5, uint4* temp, uint howmany, float4* out) { uint i = 0; for (i = 0; i < howmany ; ++i) { switch(i) { case 0: *r1 = *state4; *r2 = *state5; *a = *state1; *b = *state3; break; case 1: *r1 = *r2; *r2 = temp[0]; *a = *state2; *b = *state4; break; case 2: *r1 = *r2; *r2 = temp[1]; *a = *state3; *b = *state5; break; case 3: *r1 = *r2; *r2 = temp[2]; *a = *state4; *b = *state1; break; case 4: *r1 = *r2; *r2 = temp[3]; *a = *state5; *b = *state2; break; case 5: *r1 = *r2; *r2 = temp[4]; *a = temp[0]; *b = temp[2]; break; case 6: *r1 = *r2; *r2 = temp[5]; *a = temp[1]; *b = temp[3]; break; case 7: *r1 = *r2; *r2 = temp[6]; *a = temp[2]; *b = temp[4]; break; default: break; } lshift128(*a, shift, e); rshift128(*r1, shift, f); temp.x = (*a).x ^ (*e).x ^ (((*b).x >> thirteen) & mask11) ^ (*f).x ^ ((*r2).x << fifteen); temp.y = (*a).y ^ (*e).y ^ (((*b).y >> thirteen) & mask12) ^ (*f).y ^ ((*r2).y << fifteen); temp.z = (*a).z ^ (*e).z ^ (((*b).z >> thirteen) & mask13) ^ (*f).z ^ ((*r2).z << fifteen); temp.w = (*a).w ^ (*e).w ^ (((*b).w >> thirteen) & mask14) ^ (*f).w ^ ((*r2).w << fifteen); } for (i = 0 ; i < howmany ; ++i) out = convert_float4(temp) * one / intMax; } // rand // Chainlength is the number of vectors __kernel void test( __global uint4* seedArray, __global uint4* particles, const uint chainlength, __local uint4* share, __global uint4* partial, const uint timestep, const uint vectors_per_thread, float4 p, float4 q, __global float4* r_p, __global float4* r_q, __local float4* rp, __local float4* rq) { // Global init int gidX = get_global_id(0); int tidX = get_local_id(0); int lsiX = get_local_size(0); int GIDX = get_group_id(0); // KPZ init //const uint vectors_per_thread = chainlength / lsiX; uint4 XOR = (uint4)(0); uint4 factor = (uint4)(0); uint4 zero4 = (uint4)(0); uint my_share; // PRNG init uint4 temp[8]; uint4 state1 = seedArray[GIDX * lsiX + tidX]; uint4 state2 = (uint4)(0); uint4 state3 = (uint4)(0); uint4 state4 = (uint4)(0); uint4 state5 = (uint4)(0); uint stateMask = 1812433253u; uint thirty = 30u; uint4 mask4 = (uint4)(stateMask); uint4 thirty4 = (uint4)(thirty); uint4 one4 = (uint4)(1u); uint4 two4 = (uint4)(2u); uint4 three4 = (uint4)(3u); uint4 four4 = (uint4)(4u); uint4 r1 = (uint4)(0); uint4 r2 = (uint4)(0); uint4 a = (uint4)(0); uint4 b = (uint4)(0); uint4 e = (uint4)(0); uint4 f = (uint4)(0); //Initializing states. state2 = mask4 * (state1 ^ (state1 >> thirty4)) + one4; state3 = mask4 * (state2 ^ (state2 >> thirty4)) + two4; state4 = mask4 * (state3 ^ (state3 >> thirty4)) + three4; state5 = mask4 * (state4 ^ (state4 >> thirty4)) + four4; float4 random[8]; for (uint t = 0 ; t < timestep ; ++t) { for (uint v = 0 ; v < 1u ; ++v) { my_share = tidX * vectors_per_thread + v; particle = share[my_share]; factor = zero4; XOR = zero4; for (uint i = 0 ; i < 32u ; i += 2) { rand(&r1, &r2, &a, &b, &e, &f, &state1, &state2, &state3, &state4, &state5, temp, 8u, random); factor = select(zero4, three4, random[(i/2)%8] < p); //Works factor = select(zero4, three4, random[0] < p); //Doesn't work XOR = XOR << two4; XOR |= factor; } particle ^= XOR; output[my_share] = XOR; } } for (uint v = 0 ; v < vectors_per_thread ; ++v) { partial[GIDX * chainlength + tidX * vectors_per_thread + v] = share[tidX * vectors_per_thread + v];}

omkaranathan · ‎08-06-2010

Meteorhead,

Its difficult to identify the problem just by going through the kernel code, in case of big kernels. It will be easy to reproduce and track down the problem if you provide a compilable test-case.

Archives Discussions

Major compiler mistake?