Hi Jans,
I checked your code and found that it performs a lot of complex bitwise computations on the same elements of an array without any synchronization barriers. Therefore, the threads were executed in an arbitrary order and different results were generated on each run.
#define IV_COUNT 8
/* AMD vendor extension that makes printf available inside kernels. */
#pragma OPENCL EXTENSION cl_amd_printf: enable
#define LOCAL_SIZE 4

/* Number of 64-bit words in each work-item's state array. */
#define AIDA_STATE_WORDS 288

/* Number of leading state words that are loaded from the `random` buffer. */
#define AIDA_SEED_WORDS 80

/* Move a tap index one slot down, wrapping from 0 to the last state slot. */
#define AIDA_STEP_BACK(q)                    \
    do {                                     \
        if (--(q) < 0) {                     \
            (q) = AIDA_STATE_WORDS - 1;      \
        }                                    \
    } while (0)

/*
 * Program-scope variables must live in the __constant address space in
 * OpenCL C. The table is not referenced by the kernel below.
 */
__constant int IVPOS[IV_COUNT] = {3, 4, 7, 12, 15, 25, 29, 79};

/*
 * aida - run a 288-word shift-register style generator per work-item and
 * XOR its output words into membuff.
 *
 * The state is kept as a circular buffer: instead of shifting the words,
 * the fifteen tap indices (q066 ... q288) are stepped backwards by one slot
 * after every round.
 * NOTE(review): the tap positions (66, 69, 91, 92, 93, 162, 171, 175, 176,
 * 177, 243, 264, 286, 287, 288) look like those of the Trivium stream
 * cipher, operated on 64 lanes at once - confirm with the author.
 *
 * membuff: output buffer. Work-item g XORs one word per output round into
 *          membuff[g + (t - TMIN) * global_size] for t in [TMIN, TMAX), so
 *          the buffer must hold at least (TMAX - TMIN) * global_size words.
 * TMAX:    exclusive upper bound of the output rounds.
 * TMIN:    first output round; rounds 1 .. TMIN-1 only advance the state.
 * random:  seed words; elements 0 .. 79 are read.
 *
 * NOTE(review): nothing in the state setup depends on the work-item id, so
 * every work-item XORs the same word sequence into its own column of
 * membuff. Confirm that this is intended.
 */
__kernel void aida(
    __global unsigned long* membuff,
    const unsigned int TMAX,
    const unsigned int TMIN,
    __global unsigned long* random
)
{
    const size_t gl_id = get_global_id(0);
    const size_t max_g = get_global_size(0);

    /* Per-work-item state (no address-space qualifier). */
    unsigned long lk[AIDA_STATE_WORDS];

    /* Tap indices into lk; q288 starts at slot 0 (288 wrapped around). */
    int q066 = 66, q069 = 69, q091 = 91, q092 = 92, q093 = 93;
    int q162 = 162, q171 = 171, q175 = 175, q176 = 176, q177 = 177;
    int q243 = 243, q264 = 264, q286 = 286, q287 = 287, q288 = 0;

    int i;
    unsigned int t;

    /*
     * NOTE(review): the stores to lk[23] and lk[0] are overwritten by the
     * seed loop further down; kept in the original order. Check whether
     * they were meant to be applied after seeding.
     */
    lk[23] = 0xffffffffffffffffUL;
    lk[0] = 0xffffffffffffffffUL;
    lk[286] = 0xffffffffffffffffUL;
    lk[287] = 0xffffffffffffffffUL;

    /* Clear the middle of the state. The array subscript had been lost. */
    for (i = AIDA_SEED_WORDS; i <= 285; ++i) {
        lk[i] = 0;
    }

    /* Fixed bit patterns placed into four of the cleared words. */
    lk[96] = 0xaaaaaaaaaaaaaaaaUL;
    lk[97] = 0xccccccccccccccccUL;
    lk[100] = 0xf0f0f0f0f0f0f0f0UL;
    lk[105] = 0xff00ff00ff00ff00UL;

    /* Load the seed words. The subscripts had been lost here as well. */
    for (i = 0; i < AIDA_SEED_WORDS; ++i) {
        lk[i] = random[i];
    }

    /*
     * NOTE(review): lk carries no address-space qualifier, which the OpenCL C
     * specification treats as private (per work-item) memory, so the
     * CLK_LOCAL_MEM_FENCE barriers below do not order any access to lk. They
     * are kept exactly where the original placed them; every work-item
     * reaches them the same number of times because TMIN and TMAX are
     * kernel arguments.
     */

    /* Warm-up rounds: advance the state without producing output. */
    for (t = 1; t < TMIN; ++t) {
        lk[q288] = lk[q243] ^ lk[q288] ^ lk[q069] ^ (lk[q286] & lk[q287]);
        lk[q093] = lk[q066] ^ lk[q093] ^ lk[q171] ^ (lk[q091] & lk[q092]);
        lk[q177] = lk[q162] ^ lk[q177] ^ lk[q264] ^ (lk[q175] & lk[q176]);

        AIDA_STEP_BACK(q066);
        AIDA_STEP_BACK(q069);
        AIDA_STEP_BACK(q091);
        AIDA_STEP_BACK(q092);
        AIDA_STEP_BACK(q093);
        AIDA_STEP_BACK(q162);
        AIDA_STEP_BACK(q171);
        AIDA_STEP_BACK(q175);
        AIDA_STEP_BACK(q176);
        AIDA_STEP_BACK(q177);
        AIDA_STEP_BACK(q243);
        AIDA_STEP_BACK(q264);
        AIDA_STEP_BACK(q286);
        AIDA_STEP_BACK(q287);
        AIDA_STEP_BACK(q288);

        barrier(CLK_LOCAL_MEM_FENCE);
    }

    /* Output rounds: emit one word, then advance the state. */
    for (t = TMIN; t < TMAX; ++t) {
        membuff[gl_id + (t - TMIN) * max_g] ^=
            (lk[q243] ^ lk[q288] ^ lk[q066] ^ lk[q093] ^ lk[q162] ^ lk[q177]);
        barrier(CLK_LOCAL_MEM_FENCE);

        lk[q288] = lk[q243] ^ lk[q288] ^ lk[q069] ^ (lk[q286] & lk[q287]);
        barrier(CLK_LOCAL_MEM_FENCE);
        lk[q093] = lk[q066] ^ lk[q093] ^ lk[q171] ^ (lk[q091] & lk[q092]);
        barrier(CLK_LOCAL_MEM_FENCE);
        lk[q177] = lk[q162] ^ lk[q177] ^ lk[q264] ^ (lk[q175] & lk[q176]);
        barrier(CLK_LOCAL_MEM_FENCE);

        AIDA_STEP_BACK(q066);
        AIDA_STEP_BACK(q069);
        AIDA_STEP_BACK(q091);
        AIDA_STEP_BACK(q092);
        AIDA_STEP_BACK(q093);
        AIDA_STEP_BACK(q162);
        AIDA_STEP_BACK(q171);
        AIDA_STEP_BACK(q175);
        AIDA_STEP_BACK(q176);
        AIDA_STEP_BACK(q177);
        AIDA_STEP_BACK(q243);
        AIDA_STEP_BACK(q264);
        AIDA_STEP_BACK(q286);
        AIDA_STEP_BACK(q287);
        AIDA_STEP_BACK(q288);

        barrier(CLK_LOCAL_MEM_FENCE);
    }
}