10 Replies Latest reply on Apr 5, 2010 11:27 PM by MicahVillmow

    Different results on CPU and GPU

    Wilq

      Hi,

      I'm trying to get some of my code to work in OpenCL, but I just can't get past some strange results I'm getting.

      When I pass CL_DEVICE_TYPE_CPU as first argument to context, I get completely different results from my code than when using CL_DEVICE_TYPE_GPU.

      I've attached my code under the post.

      I'm using MSVS 08 to compile it.

      My GPU is ATI Radeon HD5770 on Windows 7 x64 with Catalyst 10.03.

      Can anyone help me with this problem? I would really appreciate it.

      // kernel.cl __constant uchar SBOX[256] = { 0x9E ,0xBC ,0xC3 ,0x82 ,0xA2 ,0x7E ,0x41 ,0x5A ,0x51 ,0x36 ,0x3F ,0xAC ,0xE3 ,0x68 ,0x2D ,0x2A, 0xEB ,0x9B ,0x1B ,0x35 ,0xDC ,0x1E ,0x56 ,0xA5 ,0xB2 ,0x74 ,0x34 ,0x12 ,0xD5 ,0x64 ,0x15 ,0xDD, 0xB6 ,0x4B ,0x8E ,0xFB ,0xCE ,0xE9 ,0xD9 ,0xA1 ,0x6E ,0xDB ,0x0F ,0x2C ,0x2B ,0x0E ,0x91 ,0xF1, 0x59 ,0xD7 ,0x3A ,0xF4 ,0x1A ,0x13 ,0x09 ,0x50 ,0xA9 ,0x63 ,0x32 ,0xF5 ,0xC9 ,0xCC ,0xAD ,0x0A, 0x5B ,0x06 ,0xE6 ,0xF7 ,0x47 ,0xBF ,0xBE ,0x44 ,0x67 ,0x7B ,0xB7 ,0x21 ,0xAF ,0x53 ,0x93 ,0xFF, 0x37 ,0x08 ,0xAE ,0x4D ,0xC4 ,0xD1 ,0x16 ,0xA4 ,0xD6 ,0x30 ,0x07 ,0x40 ,0x8B ,0x9D ,0xBB ,0x8C, 0xEF ,0x81 ,0xA8 ,0x39 ,0x1D ,0xD4 ,0x7A ,0x48 ,0x0D ,0xE2 ,0xCA ,0xB0 ,0xC7 ,0xDE ,0x28 ,0xDA, 0x97 ,0xD2 ,0xF2 ,0x84 ,0x19 ,0xB3 ,0xB9 ,0x87 ,0xA7 ,0xE4 ,0x66 ,0x49 ,0x95 ,0x99 ,0x05 ,0xA3, 0xEE ,0x61 ,0x03 ,0xC2 ,0x73 ,0xF3 ,0xB8 ,0x77 ,0xE0 ,0xF8 ,0x9C ,0x5C ,0x5F ,0xBA ,0x22 ,0xFA, 0xF0 ,0x2E ,0xFE ,0x4E ,0x98 ,0x7C ,0xD3 ,0x70 ,0x94 ,0x7D ,0xEA ,0x11 ,0x8A ,0x5D ,0x00 ,0xEC, 0xD8 ,0x27 ,0x04 ,0x7F ,0x57 ,0x17 ,0xE5 ,0x78 ,0x62 ,0x38 ,0xAB ,0xAA ,0x0B ,0x3E ,0x52 ,0x4C, 0x6B ,0xCB ,0x18 ,0x75 ,0xC0 ,0xFD ,0x20 ,0x4A ,0x86 ,0x76 ,0x8D ,0x5E ,0x01 ,0xED ,0x46 ,0x45, 0xB4 ,0xFC ,0x83 ,0x02 ,0x54 ,0xD0 ,0xDF ,0x6C ,0xCD ,0x3C ,0x6A ,0xB1 ,0x3D ,0xC8 ,0x24 ,0xE8, 0xC5 ,0x55 ,0x71 ,0x96 ,0x65 ,0x1C ,0x58 ,0x31 ,0xA0 ,0x26 ,0x6F ,0x29 ,0x14 ,0x1F ,0x6D ,0xC6, 0x88 ,0xF9 ,0x69 ,0x0C ,0x79 ,0xA6 ,0x42 ,0xF6 ,0xCF ,0x25 ,0x9A ,0x10 ,0x9F ,0xBD ,0x80 ,0x60, 0x90 ,0x2F ,0x72 ,0x85 ,0x33 ,0x3B ,0xE7 ,0x43 ,0x89 ,0xE1 ,0x8F ,0x23 ,0xC1 ,0xB5 ,0x92 ,0x4F }; uchar obliczk2plus(uchar m1, uchar c1, uchar k1) { uchar d; d=m1+k1; d=SBOX[d]; d=d-c1; return d; } uchar szyfrujplus(uchar m, uchar k1, uchar k2) { uchar d; d=m+k1; d=SBOX[d]; d=d-k2; return d; } __kernel void rownania2(__global uint * tabl, __global uint * tablmax, uint im1, uint ic1) { //uint ic1 = get_global_id(0); uint im2 = get_global_id(0); uint ic2 = get_global_id(1); if(im2>=im1+1) { uchar m1=im1; uchar c1=ic1; uchar m2=im2; uchar c2=ic2; if(c1!=c2) { uint licznik=0; uint l; for(l=0;l<256;l++) { uchar k1 = l; uchar k2 = obliczk2plus(m1,c1,k1); uchar c2x = szyfrujplus(m2,k1,k2); if(c2x==c2)licznik++; } tabl[licznik+im2*8+ic2*8*256]++; if(licznik>*tablmax)*tablmax=licznik; } } } //run.cpp void run(cl::Kernel kernel, cl::CommandQueue queue, cl::Context context) { unsigned int im1,im2,ic1,ic2; unsigned int l,l2; cl_uint* tabl = new cl_uint[8*256*256]; cl_uint* tablmax = new cl_uint; *tablmax=0; cl::Buffer cltabl(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint)*8*256*256, tabl); cl::Buffer cltablmax(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint), tablmax); kernel.setArg(1, cltablmax); cl_uint tablx[8] = {0}; for(im1=0;im1<=255;im1++) { printf("\n%d ",im1); kernel.setArg(0, cltabl); for(l=0;l<8*256*256;l++)tabl[l]=0; kernel.setArg(2, im1); for(ic1=0;ic1<=255;ic1++) { kernel.setArg(3, ic1); cl_int err = queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(256,256), cl::NDRange(16,16) ); queue.enqueueBarrier(); } queue.finish(); queue.enqueueReadBuffer( cltabl, CL_TRUE, 0, sizeof(cl_uint)*8*256*256, tabl); for(im2=im1+1;im2<=255;im2++) for(ic2=0;ic2<=255;ic2++) for(int i=0;i<8;i++) tablx[i]+=tabl[i+im2*8+ic2*8*256]; queue.enqueueReadBuffer( cltablmax, CL_TRUE, 0, sizeof(cl_uint), tablmax); for(l2=0;l2<=*tablmax;l2++) printf("\n %d %lu",l2,tablx[l2]); } }

        • Different results on CPU and GPU
          eduardoschardong

          Use atomics for the attached lines.

          tabl[licznik+im2*8+ic2*8*256]++; if(licznik>*tablmax)*tablmax=licznik;

          • Different results on CPU and GPU
            MicahVillmow
            Wilq,
            Thanks for reporting this, i'll see whats wrong and get back to you.
            • Different results on CPU and GPU
              MicahVillmow
              Wilq,
              Is it possible to send me a complete cpp file that I can compile and run to debug?

              Thanks,
                • Different results on CPU and GPU
                  Wilq

                  Here's the whole code.

                  Sorry for some cout's in Polish ;-)

                  //kernel.cl __constant uchar SBOX[256] = { 0x9E ,0xBC ,0xC3 ,0x82 ,0xA2 ,0x7E ,0x41 ,0x5A ,0x51 ,0x36 ,0x3F ,0xAC ,0xE3 ,0x68 ,0x2D ,0x2A, 0xEB ,0x9B ,0x1B ,0x35 ,0xDC ,0x1E ,0x56 ,0xA5 ,0xB2 ,0x74 ,0x34 ,0x12 ,0xD5 ,0x64 ,0x15 ,0xDD, 0xB6 ,0x4B ,0x8E ,0xFB ,0xCE ,0xE9 ,0xD9 ,0xA1 ,0x6E ,0xDB ,0x0F ,0x2C ,0x2B ,0x0E ,0x91 ,0xF1, 0x59 ,0xD7 ,0x3A ,0xF4 ,0x1A ,0x13 ,0x09 ,0x50 ,0xA9 ,0x63 ,0x32 ,0xF5 ,0xC9 ,0xCC ,0xAD ,0x0A, 0x5B ,0x06 ,0xE6 ,0xF7 ,0x47 ,0xBF ,0xBE ,0x44 ,0x67 ,0x7B ,0xB7 ,0x21 ,0xAF ,0x53 ,0x93 ,0xFF, 0x37 ,0x08 ,0xAE ,0x4D ,0xC4 ,0xD1 ,0x16 ,0xA4 ,0xD6 ,0x30 ,0x07 ,0x40 ,0x8B ,0x9D ,0xBB ,0x8C, 0xEF ,0x81 ,0xA8 ,0x39 ,0x1D ,0xD4 ,0x7A ,0x48 ,0x0D ,0xE2 ,0xCA ,0xB0 ,0xC7 ,0xDE ,0x28 ,0xDA, 0x97 ,0xD2 ,0xF2 ,0x84 ,0x19 ,0xB3 ,0xB9 ,0x87 ,0xA7 ,0xE4 ,0x66 ,0x49 ,0x95 ,0x99 ,0x05 ,0xA3, 0xEE ,0x61 ,0x03 ,0xC2 ,0x73 ,0xF3 ,0xB8 ,0x77 ,0xE0 ,0xF8 ,0x9C ,0x5C ,0x5F ,0xBA ,0x22 ,0xFA, 0xF0 ,0x2E ,0xFE ,0x4E ,0x98 ,0x7C ,0xD3 ,0x70 ,0x94 ,0x7D ,0xEA ,0x11 ,0x8A ,0x5D ,0x00 ,0xEC, 0xD8 ,0x27 ,0x04 ,0x7F ,0x57 ,0x17 ,0xE5 ,0x78 ,0x62 ,0x38 ,0xAB ,0xAA ,0x0B ,0x3E ,0x52 ,0x4C, 0x6B ,0xCB ,0x18 ,0x75 ,0xC0 ,0xFD ,0x20 ,0x4A ,0x86 ,0x76 ,0x8D ,0x5E ,0x01 ,0xED ,0x46 ,0x45, 0xB4 ,0xFC ,0x83 ,0x02 ,0x54 ,0xD0 ,0xDF ,0x6C ,0xCD ,0x3C ,0x6A ,0xB1 ,0x3D ,0xC8 ,0x24 ,0xE8, 0xC5 ,0x55 ,0x71 ,0x96 ,0x65 ,0x1C ,0x58 ,0x31 ,0xA0 ,0x26 ,0x6F ,0x29 ,0x14 ,0x1F ,0x6D ,0xC6, 0x88 ,0xF9 ,0x69 ,0x0C ,0x79 ,0xA6 ,0x42 ,0xF6 ,0xCF ,0x25 ,0x9A ,0x10 ,0x9F ,0xBD ,0x80 ,0x60, 0x90 ,0x2F ,0x72 ,0x85 ,0x33 ,0x3B ,0xE7 ,0x43 ,0x89 ,0xE1 ,0x8F ,0x23 ,0xC1 ,0xB5 ,0x92 ,0x4F }; uchar obliczk2plus(uchar m1, uchar c1, uchar k1) { uchar d; d=m1+k1; d=SBOX[d]; d=d-c1; return d; } uchar szyfrujplus(uchar m, uchar k1, uchar k2) { uchar d; d=m+k1; d=SBOX[d]; d=d-k2; return d; } #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable __kernel void rownania2(__global uint * tabl, __global uint * tablmax, uint im1, uint ic1) { uint im2 = get_global_id(0); uint ic2 = get_global_id(1); if(im2>=im1+1) { uchar m1=im1; uchar c1=ic1; uchar m2=im2; uchar c2=ic2; if(c1!=c2) { uint licznik=0; uint l; for(l=0;l<256;l++) { uchar k1 = l; uchar k2 = obliczk2plus(m1,c1,k1); uchar c2x = szyfrujplus(m2,k1,k2); if(c2x==c2)licznik++; } atom_inc(tabl+(licznik+im2*8+ic2*8*256)); if(licznik>*tablmax) atom_xchg(tablmax,licznik); } } } //main.cpp #include <cstdio> #include <cstdlib> #include <ctime> #include <string> #include <vector> #include <iostream> #include <fstream> #include <CL/cl.hpp> void rownania2(cl::Kernel kernel, cl::CommandQueue queue, cl::Context context); int main() { cl_int err; // Platform info std::vector<cl::Platform> platforms; std::cout<<"Sprawdzanie platformy\n"; err = cl::Platform::get(&platforms); if(err != CL_SUCCESS) { std::cerr << "Platform::get() failed (" << err << ")" << std::endl; return 1; } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0 }; std::cout<<"Tworzenie kontekstu obliczen\n"; cl::Context context(CL_DEVICE_TYPE_GPU, cps, NULL, NULL, &err); if (err != CL_SUCCESS) { std::cerr << "Context::Context() failed (" << err << ")\n"; return 1; } std::cout<<"Pobieranie informacji o urzadzeniach\n"; std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); if (err != CL_SUCCESS) { std::cerr << "Context::getInfo() failed (" << err << ")\n"; return 1; } if (devices.size() == 0) { std::cerr << "Brak urzadzen CL\n"; return 1; } std::cout<<"Kompilowanie algorytmow CL\n"; std::ifstream file("kernel.cl"); std::string prog(std::istreambuf_iterator<char>(file),(std::istreambuf_iterator<char>())); cl::Program::Sources sources(1, std::make_pair(prog.c_str(), prog.length())); cl::Program* pProgram; pProgram = new cl::Program(context, sources); if (err != CL_SUCCESS) { std::cerr << "Program::Program() failed (" << err << ")\n"; return 1; } cl::Program& program = *pProgram; err = program.build(devices); if (err != CL_SUCCESS) { std::cerr << "Program::build() failed (" << err << ")\n"; return 1; } cl::Kernel kernel(program, "rownania2", &err); if (err != CL_SUCCESS) { std::cerr << "Kernel::Kernel() failed (" << err << ")\n"; return 1; } if (err != CL_SUCCESS) { std::cerr << "Kernel::setArg() failed (" << err << ")\n"; return 1; } cl::CommandQueue queue(context, devices[0], 0, &err); if (err != CL_SUCCESS) { std::cerr << "CommandQueue::CommandQueue() failed (" << err << ")\n"; } rownania2(kernel, queue, context); delete pProgram; std::cout<<"Zakonczono!\n"; system("pause"); return 0; } void rownania2(cl::Kernel kernel, cl::CommandQueue queue, cl::Context context) { unsigned int im1,im2,ic1,ic2; unsigned int l,l2; cl_uint* tabl = new cl_uint[8*256*256]; cl_uint* tablmax = new cl_uint; printf("\nLamanie PP-1 1-rundowego + - \n"); *tablmax=0; cl::Buffer cltabl(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint)*8*256*256, tabl); cl::Buffer cltablmax(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint), tablmax); kernel.setArg(1, cltablmax); cl_uint tablx[8] = {0}; for(im1=0;im1<=255;im1++) { printf("\n%d ",im1); kernel.setArg(0, cltabl); for(l=0;l<8*256*256;l++)tabl[l]=0; kernel.setArg(2, im1); for(ic1=0;ic1<=255;ic1++) { kernel.setArg(3, ic1); cl_int err = queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(256,256), cl::NDRange(16,16) ); queue.enqueueBarrier(); } queue.finish(); queue.enqueueReadBuffer( cltabl, CL_TRUE, 0, sizeof(cl_uint)*8*256*256, tabl); for(im2=im1+1;im2<=255;im2++) for(ic2=0;ic2<=255;ic2++) for(int i=0;i<8;i++) tablx[i]+=tabl[i+im2*8+ic2*8*256]; queue.enqueueReadBuffer( cltablmax, CL_TRUE, 0, sizeof(cl_uint), tablmax); for(l2=0;l2<=*tablmax;l2++) printf("\n %d %lu",l2,tablx[l2]); } }

                • Different results on CPU and GPU
                  MicahVillmow
                  Wilq,
                  I believe the problem might be in the host program. I can't find anything wrong with the compilation. Can you simplify this to as small a test case as possible to help narrow down the problem. I don't understand why you are indexing the way you are and it doesn't seem to match your ND enqueue range local size.
                    • Different results on CPU and GPU
                      Wilq

                      What i'm trying to do is something that would look something like the attached code when written in plain C (notice the two additional for's).

                      The array is indexed in such a way that each pass would store it's results in a different place in memory. The results from all the passes are then summed up and the result displayed.

                      What I just can't understand is why am I getting proper results on the CPU and wrong ones on the GPU, while there is obviously no memory conflict between the threads.

                      void rownania2(__global uint * tabl, __global uint * tablmax, uint im1, uint ic1) { for(im2=im1+1;im2<=255;im2++) for(ic2=0;ic2<=255;ic2++) { uchar m1=im1; uchar c1=ic1; uchar m2=im2; uchar c2=ic2; if(c1!=c2) { uint licznik=0; uint l; for(l=0;l<256;l++) { uchar k1 = l; uchar k2 = obliczk2plus(m1,c1,k1); uchar c2x = szyfrujplus(m2,k1,k2); if(c2x==c2)licznik++; } atom_inc(tabl+(licznik+im2*8+ic2*8*256)); if(licznik>*tablmax) atom_xchg(tablmax,licznik); } } }

                    • Different results on CPU and GPU
                      MicahVillmow
                      for(im2=im1+1;im2<=255;im2++)
                      for(ic2=0;ic2<=255;ic2++)
                      If these values are used to get a unique ID, then you can replace them with size_t idx = get_global_id(0) + get_global_size(0) * get_global_id(1) and launch a 255 x 255 global size.

                      Then your indexing can be tabl + idx.

                      Also, the GPU has lots of work-items running in parallel, the CPU has one work-item executing at a time per core.
                        • Different results on CPU and GPU
                          Wilq

                          Hi,

                          I found where the bug was. My constant array SBOX was of type uchar, when I changed it to uint everything is perfect.

                          I thought that you're not allowed to use uchar pointers in the kernel, I didn't see any warning about using constant uchar arrays.

                          Thanks for all your help!

                        • Different results on CPU and GPU
                          MicahVillmow
                          Wilq,
                          You are allowed to read from a uchar pointer, you cannot write to a uchar pointer unless byte_addressable_store extension is available.