Different results on CPU and GPU

Discussion created by Wilq on Mar 31, 2010
Latest reply on Apr 5, 2010 by MicahVillmow


I'm trying to get some of my code to work in OpenCL, but I just can't get past some strange results I'm getting.

When I pass CL_DEVICE_TYPE_CPU as first argument to context, I get completely different results from my code than when using CL_DEVICE_TYPE_GPU.

I've attached my code under the post.

I'm using MSVS 08 to compile it.

My GPU is ATI Radeon HD5770 on Windows 7 x64 with Catalyst 10.03.

Can anyone help me with this problem? I would really appreciate it.

// kernel.cl __constant uchar SBOX[256] = { 0x9E ,0xBC ,0xC3 ,0x82 ,0xA2 ,0x7E ,0x41 ,0x5A ,0x51 ,0x36 ,0x3F ,0xAC ,0xE3 ,0x68 ,0x2D ,0x2A, 0xEB ,0x9B ,0x1B ,0x35 ,0xDC ,0x1E ,0x56 ,0xA5 ,0xB2 ,0x74 ,0x34 ,0x12 ,0xD5 ,0x64 ,0x15 ,0xDD, 0xB6 ,0x4B ,0x8E ,0xFB ,0xCE ,0xE9 ,0xD9 ,0xA1 ,0x6E ,0xDB ,0x0F ,0x2C ,0x2B ,0x0E ,0x91 ,0xF1, 0x59 ,0xD7 ,0x3A ,0xF4 ,0x1A ,0x13 ,0x09 ,0x50 ,0xA9 ,0x63 ,0x32 ,0xF5 ,0xC9 ,0xCC ,0xAD ,0x0A, 0x5B ,0x06 ,0xE6 ,0xF7 ,0x47 ,0xBF ,0xBE ,0x44 ,0x67 ,0x7B ,0xB7 ,0x21 ,0xAF ,0x53 ,0x93 ,0xFF, 0x37 ,0x08 ,0xAE ,0x4D ,0xC4 ,0xD1 ,0x16 ,0xA4 ,0xD6 ,0x30 ,0x07 ,0x40 ,0x8B ,0x9D ,0xBB ,0x8C, 0xEF ,0x81 ,0xA8 ,0x39 ,0x1D ,0xD4 ,0x7A ,0x48 ,0x0D ,0xE2 ,0xCA ,0xB0 ,0xC7 ,0xDE ,0x28 ,0xDA, 0x97 ,0xD2 ,0xF2 ,0x84 ,0x19 ,0xB3 ,0xB9 ,0x87 ,0xA7 ,0xE4 ,0x66 ,0x49 ,0x95 ,0x99 ,0x05 ,0xA3, 0xEE ,0x61 ,0x03 ,0xC2 ,0x73 ,0xF3 ,0xB8 ,0x77 ,0xE0 ,0xF8 ,0x9C ,0x5C ,0x5F ,0xBA ,0x22 ,0xFA, 0xF0 ,0x2E ,0xFE ,0x4E ,0x98 ,0x7C ,0xD3 ,0x70 ,0x94 ,0x7D ,0xEA ,0x11 ,0x8A ,0x5D ,0x00 ,0xEC, 0xD8 ,0x27 ,0x04 ,0x7F ,0x57 ,0x17 ,0xE5 ,0x78 ,0x62 ,0x38 ,0xAB ,0xAA ,0x0B ,0x3E ,0x52 ,0x4C, 0x6B ,0xCB ,0x18 ,0x75 ,0xC0 ,0xFD ,0x20 ,0x4A ,0x86 ,0x76 ,0x8D ,0x5E ,0x01 ,0xED ,0x46 ,0x45, 0xB4 ,0xFC ,0x83 ,0x02 ,0x54 ,0xD0 ,0xDF ,0x6C ,0xCD ,0x3C ,0x6A ,0xB1 ,0x3D ,0xC8 ,0x24 ,0xE8, 0xC5 ,0x55 ,0x71 ,0x96 ,0x65 ,0x1C ,0x58 ,0x31 ,0xA0 ,0x26 ,0x6F ,0x29 ,0x14 ,0x1F ,0x6D ,0xC6, 0x88 ,0xF9 ,0x69 ,0x0C ,0x79 ,0xA6 ,0x42 ,0xF6 ,0xCF ,0x25 ,0x9A ,0x10 ,0x9F ,0xBD ,0x80 ,0x60, 0x90 ,0x2F ,0x72 ,0x85 ,0x33 ,0x3B ,0xE7 ,0x43 ,0x89 ,0xE1 ,0x8F ,0x23 ,0xC1 ,0xB5 ,0x92 ,0x4F }; uchar obliczk2plus(uchar m1, uchar c1, uchar k1) { uchar d; d=m1+k1; d=SBOX[d]; d=d-c1; return d; } uchar szyfrujplus(uchar m, uchar k1, uchar k2) { uchar d; d=m+k1; d=SBOX[d]; d=d-k2; return d; } __kernel void rownania2(__global uint * tabl, __global uint * tablmax, uint im1, uint ic1) { //uint ic1 = get_global_id(0); uint im2 = get_global_id(0); uint ic2 = get_global_id(1); if(im2>=im1+1) { uchar m1=im1; uchar c1=ic1; uchar m2=im2; uchar c2=ic2; if(c1!=c2) { uint licznik=0; uint l; for(l=0;l<256;l++) { uchar k1 = l; uchar k2 = obliczk2plus(m1,c1,k1); uchar c2x = szyfrujplus(m2,k1,k2); if(c2x==c2)licznik++; } tabl[licznik+im2*8+ic2*8*256]++; if(licznik>*tablmax)*tablmax=licznik; } } } //run.cpp void run(cl::Kernel kernel, cl::CommandQueue queue, cl::Context context) { unsigned int im1,im2,ic1,ic2; unsigned int l,l2; cl_uint* tabl = new cl_uint[8*256*256]; cl_uint* tablmax = new cl_uint; *tablmax=0; cl::Buffer cltabl(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint)*8*256*256, tabl); cl::Buffer cltablmax(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint), tablmax); kernel.setArg(1, cltablmax); cl_uint tablx[8] = {0}; for(im1=0;im1<=255;im1++) { printf("\n%d ",im1); kernel.setArg(0, cltabl); for(l=0;l<8*256*256;l++)tabl[l]=0; kernel.setArg(2, im1); for(ic1=0;ic1<=255;ic1++) { kernel.setArg(3, ic1); cl_int err = queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(256,256), cl::NDRange(16,16) ); queue.enqueueBarrier(); } queue.finish(); queue.enqueueReadBuffer( cltabl, CL_TRUE, 0, sizeof(cl_uint)*8*256*256, tabl); for(im2=im1+1;im2<=255;im2++) for(ic2=0;ic2<=255;ic2++) for(int i=0;i<8;i++) tablx[i]+=tabl[i+im2*8+ic2*8*256]; queue.enqueueReadBuffer( cltablmax, CL_TRUE, 0, sizeof(cl_uint), tablmax); for(l2=0;l2<=*tablmax;l2++) printf("\n %d %lu",l2,tablx[l2]); } }