Hi,
I'm trying to get some of my code to work in OpenCL, but I just can't get past some strange results I'm getting.
When I pass CL_DEVICE_TYPE_CPU as first argument to context, I get completely different results from my code than when using CL_DEVICE_TYPE_GPU.
I've attached my code under the post.
I'm using MSVS 08 to compile it.
My GPU is ATI Radeon HD5770 on Windows 7 x64 with Catalyst 10.03.
Can anyone help me with this problem? I would really appreciate it.
// kernel.cl __constant uchar SBOX[256] = { 0x9E ,0xBC ,0xC3 ,0x82 ,0xA2 ,0x7E ,0x41 ,0x5A ,0x51 ,0x36 ,0x3F ,0xAC ,0xE3 ,0x68 ,0x2D ,0x2A, 0xEB ,0x9B ,0x1B ,0x35 ,0xDC ,0x1E ,0x56 ,0xA5 ,0xB2 ,0x74 ,0x34 ,0x12 ,0xD5 ,0x64 ,0x15 ,0xDD, 0xB6 ,0x4B ,0x8E ,0xFB ,0xCE ,0xE9 ,0xD9 ,0xA1 ,0x6E ,0xDB ,0x0F ,0x2C ,0x2B ,0x0E ,0x91 ,0xF1, 0x59 ,0xD7 ,0x3A ,0xF4 ,0x1A ,0x13 ,0x09 ,0x50 ,0xA9 ,0x63 ,0x32 ,0xF5 ,0xC9 ,0xCC ,0xAD ,0x0A, 0x5B ,0x06 ,0xE6 ,0xF7 ,0x47 ,0xBF ,0xBE ,0x44 ,0x67 ,0x7B ,0xB7 ,0x21 ,0xAF ,0x53 ,0x93 ,0xFF, 0x37 ,0x08 ,0xAE ,0x4D ,0xC4 ,0xD1 ,0x16 ,0xA4 ,0xD6 ,0x30 ,0x07 ,0x40 ,0x8B ,0x9D ,0xBB ,0x8C, 0xEF ,0x81 ,0xA8 ,0x39 ,0x1D ,0xD4 ,0x7A ,0x48 ,0x0D ,0xE2 ,0xCA ,0xB0 ,0xC7 ,0xDE ,0x28 ,0xDA, 0x97 ,0xD2 ,0xF2 ,0x84 ,0x19 ,0xB3 ,0xB9 ,0x87 ,0xA7 ,0xE4 ,0x66 ,0x49 ,0x95 ,0x99 ,0x05 ,0xA3, 0xEE ,0x61 ,0x03 ,0xC2 ,0x73 ,0xF3 ,0xB8 ,0x77 ,0xE0 ,0xF8 ,0x9C ,0x5C ,0x5F ,0xBA ,0x22 ,0xFA, 0xF0 ,0x2E ,0xFE ,0x4E ,0x98 ,0x7C ,0xD3 ,0x70 ,0x94 ,0x7D ,0xEA ,0x11 ,0x8A ,0x5D ,0x00 ,0xEC, 0xD8 ,0x27 ,0x04 ,0x7F ,0x57 ,0x17 ,0xE5 ,0x78 ,0x62 ,0x38 ,0xAB ,0xAA ,0x0B ,0x3E ,0x52 ,0x4C, 0x6B ,0xCB ,0x18 ,0x75 ,0xC0 ,0xFD ,0x20 ,0x4A ,0x86 ,0x76 ,0x8D ,0x5E ,0x01 ,0xED ,0x46 ,0x45, 0xB4 ,0xFC ,0x83 ,0x02 ,0x54 ,0xD0 ,0xDF ,0x6C ,0xCD ,0x3C ,0x6A ,0xB1 ,0x3D ,0xC8 ,0x24 ,0xE8, 0xC5 ,0x55 ,0x71 ,0x96 ,0x65 ,0x1C ,0x58 ,0x31 ,0xA0 ,0x26 ,0x6F ,0x29 ,0x14 ,0x1F ,0x6D ,0xC6, 0x88 ,0xF9 ,0x69 ,0x0C ,0x79 ,0xA6 ,0x42 ,0xF6 ,0xCF ,0x25 ,0x9A ,0x10 ,0x9F ,0xBD ,0x80 ,0x60, 0x90 ,0x2F ,0x72 ,0x85 ,0x33 ,0x3B ,0xE7 ,0x43 ,0x89 ,0xE1 ,0x8F ,0x23 ,0xC1 ,0xB5 ,0x92 ,0x4F }; uchar obliczk2plus(uchar m1, uchar c1, uchar k1) { uchar d; d=m1+k1; d=SBOX
; d=d-c1; return d; } uchar szyfrujplus(uchar m, uchar k1, uchar k2) { uchar d; d=m+k1; d=SBOX ; d=d-k2; return d; } __kernel void rownania2(__global uint * tabl, __global uint * tablmax, uint im1, uint ic1) { //uint ic1 = get_global_id(0); uint im2 = get_global_id(0); uint ic2 = get_global_id(1); if(im2>=im1+1) { uchar m1=im1; uchar c1=ic1; uchar m2=im2; uchar c2=ic2; if(c1!=c2) { uint licznik=0; uint l; for(l=0;l<256;l++) { uchar k1 = l; uchar k2 = obliczk2plus(m1,c1,k1); uchar c2x = szyfrujplus(m2,k1,k2); if(c2x==c2)licznik++; } tabl[licznik+im2*8+ic2*8*256]++; if(licznik>*tablmax)*tablmax=licznik; } } } //run.cpp void run(cl::Kernel kernel, cl::CommandQueue queue, cl::Context context) { unsigned int im1,im2,ic1,ic2; unsigned int l,l2; cl_uint* tabl = new cl_uint[8*256*256]; cl_uint* tablmax = new cl_uint; *tablmax=0; cl::Buffer cltabl(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint)*8*256*256, tabl); cl::Buffer cltablmax(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint), tablmax); kernel.setArg(1, cltablmax); cl_uint tablx[8] = {0}; for(im1=0;im1<=255;im1++) { printf("\n%d ",im1); kernel.setArg(0, cltabl); for(l=0;l<8*256*256;l++)tabl =0; kernel.setArg(2, im1); for(ic1=0;ic1<=255;ic1++) { kernel.setArg(3, ic1); cl_int err = queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(256,256), cl::NDRange(16,16) ); queue.enqueueBarrier(); } queue.finish(); queue.enqueueReadBuffer( cltabl, CL_TRUE, 0, sizeof(cl_uint)*8*256*256, tabl); for(im2=im1+1;im2<=255;im2++) for(ic2=0;ic2<=255;ic2++) for(int i=0;i<8;i++) tablx+=tabl[i+im2*8+ic2*8*256]; queue.enqueueReadBuffer( cltablmax, CL_TRUE, 0, sizeof(cl_uint), tablmax); for(l2=0;l2<=*tablmax;l2++) printf("\n %d %lu",l2,tablx[l2]); } }
Use atomics for the attached lines.
tabl[licznik+im2*8+ic2*8*256]++; if(licznik>*tablmax)*tablmax=licznik;
Changing those lines to the code below makes my video driver stop responding, although it still runs smoothly on the CPU.
atom_inc(tabl+(licznik+im2*8+ic2*8*256)); if(licznik>*tablmax) atom_xchg(tablmax,licznik);
Here's the whole code.
Sorry for some cout's in Polish 😉
//kernel.cl __constant uchar SBOX[256] = { 0x9E ,0xBC ,0xC3 ,0x82 ,0xA2 ,0x7E ,0x41 ,0x5A ,0x51 ,0x36 ,0x3F ,0xAC ,0xE3 ,0x68 ,0x2D ,0x2A, 0xEB ,0x9B ,0x1B ,0x35 ,0xDC ,0x1E ,0x56 ,0xA5 ,0xB2 ,0x74 ,0x34 ,0x12 ,0xD5 ,0x64 ,0x15 ,0xDD, 0xB6 ,0x4B ,0x8E ,0xFB ,0xCE ,0xE9 ,0xD9 ,0xA1 ,0x6E ,0xDB ,0x0F ,0x2C ,0x2B ,0x0E ,0x91 ,0xF1, 0x59 ,0xD7 ,0x3A ,0xF4 ,0x1A ,0x13 ,0x09 ,0x50 ,0xA9 ,0x63 ,0x32 ,0xF5 ,0xC9 ,0xCC ,0xAD ,0x0A, 0x5B ,0x06 ,0xE6 ,0xF7 ,0x47 ,0xBF ,0xBE ,0x44 ,0x67 ,0x7B ,0xB7 ,0x21 ,0xAF ,0x53 ,0x93 ,0xFF, 0x37 ,0x08 ,0xAE ,0x4D ,0xC4 ,0xD1 ,0x16 ,0xA4 ,0xD6 ,0x30 ,0x07 ,0x40 ,0x8B ,0x9D ,0xBB ,0x8C, 0xEF ,0x81 ,0xA8 ,0x39 ,0x1D ,0xD4 ,0x7A ,0x48 ,0x0D ,0xE2 ,0xCA ,0xB0 ,0xC7 ,0xDE ,0x28 ,0xDA, 0x97 ,0xD2 ,0xF2 ,0x84 ,0x19 ,0xB3 ,0xB9 ,0x87 ,0xA7 ,0xE4 ,0x66 ,0x49 ,0x95 ,0x99 ,0x05 ,0xA3, 0xEE ,0x61 ,0x03 ,0xC2 ,0x73 ,0xF3 ,0xB8 ,0x77 ,0xE0 ,0xF8 ,0x9C ,0x5C ,0x5F ,0xBA ,0x22 ,0xFA, 0xF0 ,0x2E ,0xFE ,0x4E ,0x98 ,0x7C ,0xD3 ,0x70 ,0x94 ,0x7D ,0xEA ,0x11 ,0x8A ,0x5D ,0x00 ,0xEC, 0xD8 ,0x27 ,0x04 ,0x7F ,0x57 ,0x17 ,0xE5 ,0x78 ,0x62 ,0x38 ,0xAB ,0xAA ,0x0B ,0x3E ,0x52 ,0x4C, 0x6B ,0xCB ,0x18 ,0x75 ,0xC0 ,0xFD ,0x20 ,0x4A ,0x86 ,0x76 ,0x8D ,0x5E ,0x01 ,0xED ,0x46 ,0x45, 0xB4 ,0xFC ,0x83 ,0x02 ,0x54 ,0xD0 ,0xDF ,0x6C ,0xCD ,0x3C ,0x6A ,0xB1 ,0x3D ,0xC8 ,0x24 ,0xE8, 0xC5 ,0x55 ,0x71 ,0x96 ,0x65 ,0x1C ,0x58 ,0x31 ,0xA0 ,0x26 ,0x6F ,0x29 ,0x14 ,0x1F ,0x6D ,0xC6, 0x88 ,0xF9 ,0x69 ,0x0C ,0x79 ,0xA6 ,0x42 ,0xF6 ,0xCF ,0x25 ,0x9A ,0x10 ,0x9F ,0xBD ,0x80 ,0x60, 0x90 ,0x2F ,0x72 ,0x85 ,0x33 ,0x3B ,0xE7 ,0x43 ,0x89 ,0xE1 ,0x8F ,0x23 ,0xC1 ,0xB5 ,0x92 ,0x4F }; uchar obliczk2plus(uchar m1, uchar c1, uchar k1) { uchar d; d=m1+k1; d=SBOX
; d=d-c1; return d; } uchar szyfrujplus(uchar m, uchar k1, uchar k2) { uchar d; d=m+k1; d=SBOX ; d=d-k2; return d; } #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable __kernel void rownania2(__global uint * tabl, __global uint * tablmax, uint im1, uint ic1) { uint im2 = get_global_id(0); uint ic2 = get_global_id(1); if(im2>=im1+1) { uchar m1=im1; uchar c1=ic1; uchar m2=im2; uchar c2=ic2; if(c1!=c2) { uint licznik=0; uint l; for(l=0;l<256;l++) { uchar k1 = l; uchar k2 = obliczk2plus(m1,c1,k1); uchar c2x = szyfrujplus(m2,k1,k2); if(c2x==c2)licznik++; } atom_inc(tabl+(licznik+im2*8+ic2*8*256)); if(licznik>*tablmax) atom_xchg(tablmax,licznik); } } } //main.cpp #include <cstdio> #include <cstdlib> #include <ctime> #include <string> #include <vector> #include <iostream> #include <fstream> #include <CL/cl.hpp> void rownania2(cl::Kernel kernel, cl::CommandQueue queue, cl::Context context); int main() { cl_int err; // Platform info std::vector<cl::Platform> platforms; std::cout<<"Sprawdzanie platformy\n"; err = cl::Platform::get(&platforms); if(err != CL_SUCCESS) { std::cerr << "Platform::get() failed (" << err << ")" << std::endl; return 1; } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0 }; std::cout<<"Tworzenie kontekstu obliczen\n"; cl::Context context(CL_DEVICE_TYPE_GPU, cps, NULL, NULL, &err); if (err != CL_SUCCESS) { std::cerr << "Context::Context() failed (" << err << ")\n"; return 1; } std::cout<<"Pobieranie informacji o urzadzeniach\n"; std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); if (err != CL_SUCCESS) { std::cerr << "Context::getInfo() failed (" << err << ")\n"; return 1; } if (devices.size() == 0) { std::cerr << "Brak urzadzen CL\n"; return 1; } std::cout<<"Kompilowanie algorytmow CL\n"; std::ifstream file("kernel.cl"); std::string prog(std::istreambuf_iterator<char>(file),(std::istreambuf_iterator<char>())); cl::Program::Sources sources(1, std::make_pair(prog.c_str(), prog.length())); cl::Program* pProgram; pProgram = new cl::Program(context, sources); if (err != CL_SUCCESS) { std::cerr << "Program::Program() failed (" << err << ")\n"; return 1; } cl::Program& program = *pProgram; err = program.build(devices); if (err != CL_SUCCESS) { std::cerr << "Program::build() failed (" << err << ")\n"; return 1; } cl::Kernel kernel(program, "rownania2", &err); if (err != CL_SUCCESS) { std::cerr << "Kernel::Kernel() failed (" << err << ")\n"; return 1; } if (err != CL_SUCCESS) { std::cerr << "Kernel::setArg() failed (" << err << ")\n"; return 1; } cl::CommandQueue queue(context, devices[0], 0, &err); if (err != CL_SUCCESS) { std::cerr << "CommandQueue::CommandQueue() failed (" << err << ")\n"; } rownania2(kernel, queue, context); delete pProgram; std::cout<<"Zakonczono!\n"; system("pause"); return 0; } void rownania2(cl::Kernel kernel, cl::CommandQueue queue, cl::Context context) { unsigned int im1,im2,ic1,ic2; unsigned int l,l2; cl_uint* tabl = new cl_uint[8*256*256]; cl_uint* tablmax = new cl_uint; printf("\nLamanie PP-1 1-rundowego + - \n"); *tablmax=0; cl::Buffer cltabl(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint)*8*256*256, tabl); cl::Buffer cltablmax(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint), tablmax); kernel.setArg(1, cltablmax); cl_uint tablx[8] = {0}; for(im1=0;im1<=255;im1++) { printf("\n%d ",im1); kernel.setArg(0, cltabl); for(l=0;l<8*256*256;l++)tabl =0; kernel.setArg(2, im1); for(ic1=0;ic1<=255;ic1++) { kernel.setArg(3, ic1); cl_int err = queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(256,256), cl::NDRange(16,16) ); queue.enqueueBarrier(); } queue.finish(); queue.enqueueReadBuffer( cltabl, CL_TRUE, 0, sizeof(cl_uint)*8*256*256, tabl); for(im2=im1+1;im2<=255;im2++) for(ic2=0;ic2<=255;ic2++) for(int i=0;i<8;i++) tablx+=tabl[i+im2*8+ic2*8*256]; queue.enqueueReadBuffer( cltablmax, CL_TRUE, 0, sizeof(cl_uint), tablmax); for(l2=0;l2<=*tablmax;l2++) printf("\n %d %lu",l2,tablx[l2]); } }
What i'm trying to do is something that would look something like the attached code when written in plain C (notice the two additional for's).
The array is indexed in such a way that each pass would store it's results in a different place in memory. The results from all the passes are then summed up and the result displayed.
What I just can't understand is why am I getting proper results on the CPU and wrong ones on the GPU, while there is obviously no memory conflict between the threads.
void rownania2(__global uint * tabl, __global uint * tablmax, uint im1, uint ic1) { for(im2=im1+1;im2<=255;im2++) for(ic2=0;ic2<=255;ic2++) { uchar m1=im1; uchar c1=ic1; uchar m2=im2; uchar c2=ic2; if(c1!=c2) { uint licznik=0; uint l; for(l=0;l<256;l++) { uchar k1 = l; uchar k2 = obliczk2plus(m1,c1,k1); uchar c2x = szyfrujplus(m2,k1,k2); if(c2x==c2)licznik++; } atom_inc(tabl+(licznik+im2*8+ic2*8*256)); if(licznik>*tablmax) atom_xchg(tablmax,licznik); } } }
Hi,
I found where the bug was. My constant array SBOX was of type uchar, when I changed it to uint everything is perfect.
I thought that you're not allowed to use uchar pointers in the kernel, I didn't see any warning about using constant uchar arrays.
Thanks for all your help!