1 Reply Latest reply on Oct 15, 2010 4:44 AM by himanshu.gautam

    Different Results on multiple Runs, only the last few Bits differ

    JanS

      Hi,

      My program alternates between two different results every time I start it. It also gives different results on the GPU and on the CPU!

      I'm using 10.7b, Windows 7 64bit and SDK 2.2. The problem existed on 2.1 too.

      The result is computed from an array of 175 unsigned longs, whose values seem to differ only in the last few bits.

      Both results are CPU only. It doesn't matter how often I start it; it varies between these two results:

      Starting kernel...

       

      [0] 4283270082281354098

       

      vs

      [0] 4283270082281382770

       

      [1] 10912951361200691065

      vs

      [1] 10912951361200006513

       

      [2] 10790209076973444528

      vs

      [2] 10790209076973450672

       

      [3] 2593268536612269050

      vs

      [3] 2593268536611711474

       

      [4] 16457265885716341828

      vs

      [4] 16457265885716216900

       

       

      What's happening here? Do I have an error in my program?

      Maybe it's because of floating-point accuracy?

      The complete source code, kernel and .exe is here:

      http://www.jevermeister.de/not_working.zip

      If you want to use your CPU, just start it with an arbitrary command-line argument.

       

      Thanks!

        • Different Results on multiple Runs, only the last few Bits differ
          himanshu.gautam

          Hi Jans,

          I checked your code and found that it performs a lot of complex bitwise computations on the same elements of an array without any synchronization barriers. Therefore the threads were executed in random order, and different results were generated each time.

           

          /*
           * OpenCL C source for the "aida" kernel.
           *
           * Every work-item keeps a private circular buffer of STATE_WORDS 64-bit
           * words, steps it TMIN - 1 times without producing output, and then, for
           * each step t in [TMIN, TMAX), XORs one word derived from six taps into
           * its own column of the global output buffer.
           *
           * NOTE(review): the tap offsets (66, 69, 91, 92, 93, 162, 171, 175, 176,
           * 177, 243, 264, 286, 287, 288) look like those of the Trivium stream
           * cipher evaluated 64 instances at a time -- confirm with the author.
           */
          #define IV_COUNT 8
          #pragma OPENCL EXTENSION cl_amd_printf: enable
          #define LOCAL_SIZE 4

          /* Number of 64-bit words in the circular state buffer. */
          #define STATE_WORDS 288

          /*
           * Not referenced by the kernel below. OpenCL C requires program-scope
           * variables to live in the __constant address space, so a plain `const`
           * declaration is rejected by conforming compilers.
           */
          __constant int IVPOS[IV_COUNT] = {3, 4, 7, 12, 15, 25, 29, 79};

          /*
           * Current slot of each tap inside the circular buffer. The field name
           * carries the tap's nominal offset; tap 288 starts in slot 0.
           */
          typedef struct {
              int q066, q069, q091, q092, q093;
              int q162, q171, q175, q176, q177;
              int q243, q264, q286, q287, q288;
          } aida_taps;

          /* Move one slot index back by one, wrapping from 0 to STATE_WORDS - 1. */
          int aida_step_back(int q)
          {
              return (q == 0) ? STATE_WORDS - 1 : q - 1;
          }

          /*
           * Shift the whole state by one position. The words themselves stay where
           * they are; only the tap indices move.
           */
          void aida_rotate(aida_taps *tap)
          {
              tap->q066 = aida_step_back(tap->q066);
              tap->q069 = aida_step_back(tap->q069);
              tap->q091 = aida_step_back(tap->q091);
              tap->q092 = aida_step_back(tap->q092);
              tap->q093 = aida_step_back(tap->q093);
              tap->q162 = aida_step_back(tap->q162);
              tap->q171 = aida_step_back(tap->q171);
              tap->q175 = aida_step_back(tap->q175);
              tap->q176 = aida_step_back(tap->q176);
              tap->q177 = aida_step_back(tap->q177);
              tap->q243 = aida_step_back(tap->q243);
              tap->q264 = aida_step_back(tap->q264);
              tap->q286 = aida_step_back(tap->q286);
              tap->q287 = aida_step_back(tap->q287);
              tap->q288 = aida_step_back(tap->q288);
          }

          /*
           * membuff  in/out; element [gl_id + (t - TMIN) * global_size] is XOR-updated
           *          for every t in [TMIN, TMAX), so the buffer must hold at least
           *          (TMAX - TMIN) * global_size elements.
           * TMAX     exclusive upper bound of the output steps.
           * TMIN     first step that produces output; steps 1 .. TMIN - 1 only
           *          advance the state.
           * random   source of the first 80 state words (elements 0 .. 79 are read).
           */
          __kernel void aida(__global unsigned long *membuff,
                             const unsigned int TMAX,
                             const unsigned int TMIN,
                             __global unsigned long *random)
          {
              const size_t gl_id = get_global_id(0);
              const size_t max_g = get_global_size(0);

              unsigned long lk[STATE_WORDS];
              aida_taps tap = {66, 69, 91, 92, 93,
                               162, 171, 175, 176, 177,
                               243, 264, 286, 287, 0};

              /* Words 80 .. 285 start cleared, the last two words start all-ones. */
              for (int i = 80; i <= 285; ++i) {
                  lk[i] = 0;
              }
              lk[286] = 0xffffffffffffffffUL;
              lk[287] = 0xffffffffffffffffUL;

              /* Four words inside the cleared range carry fixed bit patterns. */
              lk[96]  = 0xaaaaaaaaaaaaaaaaUL;
              lk[97]  = 0xccccccccccccccccUL;
              lk[100] = 0xf0f0f0f0f0f0f0f0UL;
              lk[105] = 0xff00ff00ff00ff00UL;

              /*
               * NOTE(review): the original also stored all-ones into lk[0] and
               * lk[23] before this copy, which then overwrote both. Those dead
               * stores are dropped here; confirm the copy was meant to win.
               */
              for (int i = 0; i < 80; ++i) {
                  lk[i] = random[i];
              }

              /*
               * NOTE(review): lk is declared inside the kernel without an address
               * space qualifier and each work-item writes only its own membuff
               * elements, so the CLK_LOCAL_MEM_FENCE barriers below do not appear
               * to order any shared access. They are kept exactly where the posted
               * code had them; confirm before removing.
               */

              /* Warm-up: advance the state without producing output. */
              for (unsigned int t = 1; t < TMIN; ++t) {
                  lk[tap.q288] = lk[tap.q243] ^ lk[tap.q288] ^ lk[tap.q069]
                               ^ (lk[tap.q286] & lk[tap.q287]);
                  lk[tap.q093] = lk[tap.q066] ^ lk[tap.q093] ^ lk[tap.q171]
                               ^ (lk[tap.q091] & lk[tap.q092]);
                  lk[tap.q177] = lk[tap.q162] ^ lk[tap.q177] ^ lk[tap.q264]
                               ^ (lk[tap.q175] & lk[tap.q176]);
                  aida_rotate(&tap);
                  barrier(CLK_LOCAL_MEM_FENCE);
              }

              /* Output phase: fold one word per step into this work-item's column. */
              for (unsigned int t = TMIN; t < TMAX; ++t) {
                  membuff[gl_id + (t - TMIN) * max_g] ^=
                      (lk[tap.q243] ^ lk[tap.q288] ^ lk[tap.q066]
                       ^ lk[tap.q093] ^ lk[tap.q162] ^ lk[tap.q177]);
                  barrier(CLK_LOCAL_MEM_FENCE);

                  lk[tap.q288] = lk[tap.q243] ^ lk[tap.q288] ^ lk[tap.q069]
                               ^ (lk[tap.q286] & lk[tap.q287]);
                  barrier(CLK_LOCAL_MEM_FENCE);

                  lk[tap.q093] = lk[tap.q066] ^ lk[tap.q093] ^ lk[tap.q171]
                               ^ (lk[tap.q091] & lk[tap.q092]);
                  barrier(CLK_LOCAL_MEM_FENCE);

                  lk[tap.q177] = lk[tap.q162] ^ lk[tap.q177] ^ lk[tap.q264]
                               ^ (lk[tap.q175] & lk[tap.q176]);
                  barrier(CLK_LOCAL_MEM_FENCE);

                  aida_rotate(&tap);
                  barrier(CLK_LOCAL_MEM_FENCE);
              }
          }