cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

Wilq
Journeyman III

Different results on CPU and GPU

Hi,

I'm trying to get some of my code to work in OpenCL, but I just can't get past some strange results I'm getting.

When I pass CL_DEVICE_TYPE_CPU as first argument to context, I get completely different results from my code than when using CL_DEVICE_TYPE_GPU.

I've attached my code under the post.

I'm using MSVS 08 to compile it.

My GPU is ATI Radeon HD5770 on Windows 7 x64 with Catalyst 10.03.

Can anyone help me with this problem? I would really appreciate it.

// kernel.cl __constant uchar SBOX[256] = { 0x9E ,0xBC ,0xC3 ,0x82 ,0xA2 ,0x7E ,0x41 ,0x5A ,0x51 ,0x36 ,0x3F ,0xAC ,0xE3 ,0x68 ,0x2D ,0x2A, 0xEB ,0x9B ,0x1B ,0x35 ,0xDC ,0x1E ,0x56 ,0xA5 ,0xB2 ,0x74 ,0x34 ,0x12 ,0xD5 ,0x64 ,0x15 ,0xDD, 0xB6 ,0x4B ,0x8E ,0xFB ,0xCE ,0xE9 ,0xD9 ,0xA1 ,0x6E ,0xDB ,0x0F ,0x2C ,0x2B ,0x0E ,0x91 ,0xF1, 0x59 ,0xD7 ,0x3A ,0xF4 ,0x1A ,0x13 ,0x09 ,0x50 ,0xA9 ,0x63 ,0x32 ,0xF5 ,0xC9 ,0xCC ,0xAD ,0x0A, 0x5B ,0x06 ,0xE6 ,0xF7 ,0x47 ,0xBF ,0xBE ,0x44 ,0x67 ,0x7B ,0xB7 ,0x21 ,0xAF ,0x53 ,0x93 ,0xFF, 0x37 ,0x08 ,0xAE ,0x4D ,0xC4 ,0xD1 ,0x16 ,0xA4 ,0xD6 ,0x30 ,0x07 ,0x40 ,0x8B ,0x9D ,0xBB ,0x8C, 0xEF ,0x81 ,0xA8 ,0x39 ,0x1D ,0xD4 ,0x7A ,0x48 ,0x0D ,0xE2 ,0xCA ,0xB0 ,0xC7 ,0xDE ,0x28 ,0xDA, 0x97 ,0xD2 ,0xF2 ,0x84 ,0x19 ,0xB3 ,0xB9 ,0x87 ,0xA7 ,0xE4 ,0x66 ,0x49 ,0x95 ,0x99 ,0x05 ,0xA3, 0xEE ,0x61 ,0x03 ,0xC2 ,0x73 ,0xF3 ,0xB8 ,0x77 ,0xE0 ,0xF8 ,0x9C ,0x5C ,0x5F ,0xBA ,0x22 ,0xFA, 0xF0 ,0x2E ,0xFE ,0x4E ,0x98 ,0x7C ,0xD3 ,0x70 ,0x94 ,0x7D ,0xEA ,0x11 ,0x8A ,0x5D ,0x00 ,0xEC, 0xD8 ,0x27 ,0x04 ,0x7F ,0x57 ,0x17 ,0xE5 ,0x78 ,0x62 ,0x38 ,0xAB ,0xAA ,0x0B ,0x3E ,0x52 ,0x4C, 0x6B ,0xCB ,0x18 ,0x75 ,0xC0 ,0xFD ,0x20 ,0x4A ,0x86 ,0x76 ,0x8D ,0x5E ,0x01 ,0xED ,0x46 ,0x45, 0xB4 ,0xFC ,0x83 ,0x02 ,0x54 ,0xD0 ,0xDF ,0x6C ,0xCD ,0x3C ,0x6A ,0xB1 ,0x3D ,0xC8 ,0x24 ,0xE8, 0xC5 ,0x55 ,0x71 ,0x96 ,0x65 ,0x1C ,0x58 ,0x31 ,0xA0 ,0x26 ,0x6F ,0x29 ,0x14 ,0x1F ,0x6D ,0xC6, 0x88 ,0xF9 ,0x69 ,0x0C ,0x79 ,0xA6 ,0x42 ,0xF6 ,0xCF ,0x25 ,0x9A ,0x10 ,0x9F ,0xBD ,0x80 ,0x60, 0x90 ,0x2F ,0x72 ,0x85 ,0x33 ,0x3B ,0xE7 ,0x43 ,0x89 ,0xE1 ,0x8F ,0x23 ,0xC1 ,0xB5 ,0x92 ,0x4F }; uchar obliczk2plus(uchar m1, uchar c1, uchar k1) { uchar d; d=m1+k1; d=SBOX; d=d-c1; return d; } uchar szyfrujplus(uchar m, uchar k1, uchar k2) { uchar d; d=m+k1; d=SBOX; d=d-k2; return d; } __kernel void rownania2(__global uint * tabl, __global uint * tablmax, uint im1, uint ic1) { //uint ic1 = get_global_id(0); uint im2 = get_global_id(0); uint ic2 = get_global_id(1); if(im2>=im1+1) { uchar m1=im1; uchar c1=ic1; uchar m2=im2; uchar c2=ic2; if(c1!=c2) { uint licznik=0; uint l; for(l=0;l<256;l++) { uchar k1 = l; uchar k2 = obliczk2plus(m1,c1,k1); uchar c2x = szyfrujplus(m2,k1,k2); if(c2x==c2)licznik++; } tabl[licznik+im2*8+ic2*8*256]++; if(licznik>*tablmax)*tablmax=licznik; } } } //run.cpp void run(cl::Kernel kernel, cl::CommandQueue queue, cl::Context context) { unsigned int im1,im2,ic1,ic2; unsigned int l,l2; cl_uint* tabl = new cl_uint[8*256*256]; cl_uint* tablmax = new cl_uint; *tablmax=0; cl::Buffer cltabl(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint)*8*256*256, tabl); cl::Buffer cltablmax(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint), tablmax); kernel.setArg(1, cltablmax); cl_uint tablx[8] = {0}; for(im1=0;im1<=255;im1++) { printf("\n%d ",im1); kernel.setArg(0, cltabl); for(l=0;l<8*256*256;l++)tabl=0; kernel.setArg(2, im1); for(ic1=0;ic1<=255;ic1++) { kernel.setArg(3, ic1); cl_int err = queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(256,256), cl::NDRange(16,16) ); queue.enqueueBarrier(); } queue.finish(); queue.enqueueReadBuffer( cltabl, CL_TRUE, 0, sizeof(cl_uint)*8*256*256, tabl); for(im2=im1+1;im2<=255;im2++) for(ic2=0;ic2<=255;ic2++) for(int i=0;i<8;i++) tablx+=tabl[i+im2*8+ic2*8*256]; queue.enqueueReadBuffer( cltablmax, CL_TRUE, 0, sizeof(cl_uint), tablmax); for(l2=0;l2<=*tablmax;l2++) printf("\n %d %lu",l2,tablx[l2]); } }

0 Likes
10 Replies
eduardoschardong
Journeyman III

Use atomics for the attached lines.

tabl[licznik+im2*8+ic2*8*256]++; if(licznik>*tablmax)*tablmax=licznik;

0 Likes

Changing those lines to the code below makes my video driver stop responding, although it still runs smoothly on the CPU.

atom_inc(tabl+(licznik+im2*8+ic2*8*256)); if(licznik>*tablmax) atom_xchg(tablmax,licznik);

0 Likes

Wilq,
Thanks for reporting this, i'll see whats wrong and get back to you.
0 Likes

Wilq,
Is it possible to send me a complete cpp file that I can compile and run to debug?

Thanks,
0 Likes

Here's the whole code.

Sorry for some cout's in Polish 😉

//kernel.cl __constant uchar SBOX[256] = { 0x9E ,0xBC ,0xC3 ,0x82 ,0xA2 ,0x7E ,0x41 ,0x5A ,0x51 ,0x36 ,0x3F ,0xAC ,0xE3 ,0x68 ,0x2D ,0x2A, 0xEB ,0x9B ,0x1B ,0x35 ,0xDC ,0x1E ,0x56 ,0xA5 ,0xB2 ,0x74 ,0x34 ,0x12 ,0xD5 ,0x64 ,0x15 ,0xDD, 0xB6 ,0x4B ,0x8E ,0xFB ,0xCE ,0xE9 ,0xD9 ,0xA1 ,0x6E ,0xDB ,0x0F ,0x2C ,0x2B ,0x0E ,0x91 ,0xF1, 0x59 ,0xD7 ,0x3A ,0xF4 ,0x1A ,0x13 ,0x09 ,0x50 ,0xA9 ,0x63 ,0x32 ,0xF5 ,0xC9 ,0xCC ,0xAD ,0x0A, 0x5B ,0x06 ,0xE6 ,0xF7 ,0x47 ,0xBF ,0xBE ,0x44 ,0x67 ,0x7B ,0xB7 ,0x21 ,0xAF ,0x53 ,0x93 ,0xFF, 0x37 ,0x08 ,0xAE ,0x4D ,0xC4 ,0xD1 ,0x16 ,0xA4 ,0xD6 ,0x30 ,0x07 ,0x40 ,0x8B ,0x9D ,0xBB ,0x8C, 0xEF ,0x81 ,0xA8 ,0x39 ,0x1D ,0xD4 ,0x7A ,0x48 ,0x0D ,0xE2 ,0xCA ,0xB0 ,0xC7 ,0xDE ,0x28 ,0xDA, 0x97 ,0xD2 ,0xF2 ,0x84 ,0x19 ,0xB3 ,0xB9 ,0x87 ,0xA7 ,0xE4 ,0x66 ,0x49 ,0x95 ,0x99 ,0x05 ,0xA3, 0xEE ,0x61 ,0x03 ,0xC2 ,0x73 ,0xF3 ,0xB8 ,0x77 ,0xE0 ,0xF8 ,0x9C ,0x5C ,0x5F ,0xBA ,0x22 ,0xFA, 0xF0 ,0x2E ,0xFE ,0x4E ,0x98 ,0x7C ,0xD3 ,0x70 ,0x94 ,0x7D ,0xEA ,0x11 ,0x8A ,0x5D ,0x00 ,0xEC, 0xD8 ,0x27 ,0x04 ,0x7F ,0x57 ,0x17 ,0xE5 ,0x78 ,0x62 ,0x38 ,0xAB ,0xAA ,0x0B ,0x3E ,0x52 ,0x4C, 0x6B ,0xCB ,0x18 ,0x75 ,0xC0 ,0xFD ,0x20 ,0x4A ,0x86 ,0x76 ,0x8D ,0x5E ,0x01 ,0xED ,0x46 ,0x45, 0xB4 ,0xFC ,0x83 ,0x02 ,0x54 ,0xD0 ,0xDF ,0x6C ,0xCD ,0x3C ,0x6A ,0xB1 ,0x3D ,0xC8 ,0x24 ,0xE8, 0xC5 ,0x55 ,0x71 ,0x96 ,0x65 ,0x1C ,0x58 ,0x31 ,0xA0 ,0x26 ,0x6F ,0x29 ,0x14 ,0x1F ,0x6D ,0xC6, 0x88 ,0xF9 ,0x69 ,0x0C ,0x79 ,0xA6 ,0x42 ,0xF6 ,0xCF ,0x25 ,0x9A ,0x10 ,0x9F ,0xBD ,0x80 ,0x60, 0x90 ,0x2F ,0x72 ,0x85 ,0x33 ,0x3B ,0xE7 ,0x43 ,0x89 ,0xE1 ,0x8F ,0x23 ,0xC1 ,0xB5 ,0x92 ,0x4F }; uchar obliczk2plus(uchar m1, uchar c1, uchar k1) { uchar d; d=m1+k1; d=SBOX; d=d-c1; return d; } uchar szyfrujplus(uchar m, uchar k1, uchar k2) { uchar d; d=m+k1; d=SBOX; d=d-k2; return d; } #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable __kernel void rownania2(__global uint * tabl, __global uint * tablmax, uint im1, uint ic1) { uint im2 = get_global_id(0); uint ic2 = get_global_id(1); if(im2>=im1+1) { uchar m1=im1; uchar c1=ic1; uchar m2=im2; uchar c2=ic2; if(c1!=c2) { uint licznik=0; uint l; for(l=0;l<256;l++) { uchar k1 = l; uchar k2 = obliczk2plus(m1,c1,k1); uchar c2x = szyfrujplus(m2,k1,k2); if(c2x==c2)licznik++; } atom_inc(tabl+(licznik+im2*8+ic2*8*256)); if(licznik>*tablmax) atom_xchg(tablmax,licznik); } } } //main.cpp #include <cstdio> #include <cstdlib> #include <ctime> #include <string> #include <vector> #include <iostream> #include <fstream> #include <CL/cl.hpp> void rownania2(cl::Kernel kernel, cl::CommandQueue queue, cl::Context context); int main() { cl_int err; // Platform info std::vector<cl::Platform> platforms; std::cout<<"Sprawdzanie platformy\n"; err = cl::Platform::get(&platforms); if(err != CL_SUCCESS) { std::cerr << "Platform::get() failed (" << err << ")" << std::endl; return 1; } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0 }; std::cout<<"Tworzenie kontekstu obliczen\n"; cl::Context context(CL_DEVICE_TYPE_GPU, cps, NULL, NULL, &err); if (err != CL_SUCCESS) { std::cerr << "Context::Context() failed (" << err << ")\n"; return 1; } std::cout<<"Pobieranie informacji o urzadzeniach\n"; std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); if (err != CL_SUCCESS) { std::cerr << "Context::getInfo() failed (" << err << ")\n"; return 1; } if (devices.size() == 0) { std::cerr << "Brak urzadzen CL\n"; return 1; } std::cout<<"Kompilowanie algorytmow CL\n"; std::ifstream file("kernel.cl"); std::string prog(std::istreambuf_iterator<char>(file),(std::istreambuf_iterator<char>())); cl::Program::Sources sources(1, std::make_pair(prog.c_str(), prog.length())); cl::Program* pProgram; pProgram = new cl::Program(context, sources); if (err != CL_SUCCESS) { std::cerr << "Program::Program() failed (" << err << ")\n"; return 1; } cl::Program& program = *pProgram; err = program.build(devices); if (err != CL_SUCCESS) { std::cerr << "Program::build() failed (" << err << ")\n"; return 1; } cl::Kernel kernel(program, "rownania2", &err); if (err != CL_SUCCESS) { std::cerr << "Kernel::Kernel() failed (" << err << ")\n"; return 1; } if (err != CL_SUCCESS) { std::cerr << "Kernel::setArg() failed (" << err << ")\n"; return 1; } cl::CommandQueue queue(context, devices[0], 0, &err); if (err != CL_SUCCESS) { std::cerr << "CommandQueue::CommandQueue() failed (" << err << ")\n"; } rownania2(kernel, queue, context); delete pProgram; std::cout<<"Zakonczono!\n"; system("pause"); return 0; } void rownania2(cl::Kernel kernel, cl::CommandQueue queue, cl::Context context) { unsigned int im1,im2,ic1,ic2; unsigned int l,l2; cl_uint* tabl = new cl_uint[8*256*256]; cl_uint* tablmax = new cl_uint; printf("\nLamanie PP-1 1-rundowego + - \n"); *tablmax=0; cl::Buffer cltabl(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint)*8*256*256, tabl); cl::Buffer cltablmax(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint), tablmax); kernel.setArg(1, cltablmax); cl_uint tablx[8] = {0}; for(im1=0;im1<=255;im1++) { printf("\n%d ",im1); kernel.setArg(0, cltabl); for(l=0;l<8*256*256;l++)tabl=0; kernel.setArg(2, im1); for(ic1=0;ic1<=255;ic1++) { kernel.setArg(3, ic1); cl_int err = queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(256,256), cl::NDRange(16,16) ); queue.enqueueBarrier(); } queue.finish(); queue.enqueueReadBuffer( cltabl, CL_TRUE, 0, sizeof(cl_uint)*8*256*256, tabl); for(im2=im1+1;im2<=255;im2++) for(ic2=0;ic2<=255;ic2++) for(int i=0;i<8;i++) tablx+=tabl[i+im2*8+ic2*8*256]; queue.enqueueReadBuffer( cltablmax, CL_TRUE, 0, sizeof(cl_uint), tablmax); for(l2=0;l2<=*tablmax;l2++) printf("\n %d %lu",l2,tablx[l2]); } }

0 Likes

Wilq,
I believe the problem might be in the host program. I can't find anything wrong with the compilation. Can you simplify this to as small a test case as possible to help narrow down the problem. I don't understand why you are indexing the way you are and it doesn't seem to match your ND enqueue range local size.
0 Likes

What i'm trying to do is something that would look something like the attached code when written in plain C (notice the two additional for's).

The array is indexed in such a way that each pass would store it's results in a different place in memory. The results from all the passes are then summed up and the result displayed.

What I just can't understand is why am I getting proper results on the CPU and wrong ones on the GPU, while there is obviously no memory conflict between the threads.

void rownania2(__global uint * tabl, __global uint * tablmax, uint im1, uint ic1) { for(im2=im1+1;im2<=255;im2++) for(ic2=0;ic2<=255;ic2++) { uchar m1=im1; uchar c1=ic1; uchar m2=im2; uchar c2=ic2; if(c1!=c2) { uint licznik=0; uint l; for(l=0;l<256;l++) { uchar k1 = l; uchar k2 = obliczk2plus(m1,c1,k1); uchar c2x = szyfrujplus(m2,k1,k2); if(c2x==c2)licznik++; } atom_inc(tabl+(licznik+im2*8+ic2*8*256)); if(licznik>*tablmax) atom_xchg(tablmax,licznik); } } }

0 Likes

for(im2=im1+1;im2<=255;im2++)
for(ic2=0;ic2<=255;ic2++)
If these values are used to get a unique ID, then you can replace them with size_t idx = get_global_id(0) + get_global_size(0) * get_global_id(1) and launch a 255 x 255 global size.

Then your indexing can be tabl + idx.

Also, the GPU has lots of work-items running in parallel, the CPU has one work-item executing at a time per core.
0 Likes

Hi,

I found where the bug was. My constant array SBOX was of type uchar, when I changed it to uint everything is perfect.

I thought that you're not allowed to use uchar pointers in the kernel, I didn't see any warning about using constant uchar arrays.

Thanks for all your help!

0 Likes

Wilq,
You are allowed to read from a uchar pointer, you cannot write to a uchar pointer unless byte_addressable_store extension is available.
0 Likes