I have a transpose kernel taken directly from cuda and converted to OpenCL and while the converted kernel works perfectly on the CPU and on nVidia's OpenCL implementation, it fails on AMD OpenCL.
As the kernel is a direct conversion, no optimisations or anything else have been done with it. The local work group size is set to 32,8,0, i.e. a workgroupsize of 256. Checking the workgroupsize with clGetKernelWorkGroupInfo lets the kernel pass and I assume this means it should run without error caused by the chosen work group size. Is that assumption wrong?
I know AMD cards prefer a work group size of 64...
The kernel arguments are these: dst = destination array, ldd = destination stride, src = source array, lds = source stride.
My own way to setup the dimensions is like this:
ccSetWorkDim(&kernel, 2);
ccSetLocalWorkSize(&kernel, 32, 8, 1);
ccSetGlobalWorkSize(&kernel, ((rows+31)/32)*32, ((cols+31)/32)*8, 1);
And I call the kernel like this:
ccSetKernelArgs(kernel, 4, "cl_mem", &subDST, "cl_uint", dst.lda, "cl_mem", &subSRC, "cl_uint", src.lda);
Funny thing is, I have an inplace transpose using the same workgroup size and it works just fine.
kernel void transpose( global double *dst, int ldd, global double *src, int lds ) { int2 blockIdx = { get_group_id(0), get_group_id(1) }; int2 threadIdx = { get_local_id(0), get_local_id(1) }; src += blockIdx.x*32 + threadIdx.x + ( blockIdx.y*32 + threadIdx.y ) * lds; dst += blockIdx.y*32 + threadIdx.x + ( blockIdx.x*32 + threadIdx.y ) * ldd; local double a[32][33]; // // load 32x32 block // for( int i = 0; i < 32; i += BLOCK_SIZE ) a[i+threadIdx.y][threadIdx.x] = src[i*lds]; barrier(CLK_LOCAL_MEM_FENCE); // // store transposed block // for( int i = 0; i < 32; i += BLOCK_SIZE ) dst[i*ldd] = a[threadIdx.x][i+threadIdx.y]; }
I am sorry. I forgot to provide full information:
The kernel runs on an ATI 5870 and the listed kernel is only 1 of many kernels in the file. Double is enabled:
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
Many of the other kernels in the file work as intended (haven't tested all on AMD but all kernels work on nVidia). The non-working kernel is the first defined in the file.
#define BLOCK_SIZE 8
I have tried allocating the local memory from the host using clSetKernelArgs but it changed nothing. Adding 0.0 to the last assignment seems to change the output:
dst[i*ldd] = a[threadIdx.x][i+threadIdx.y] + 0.0; // 0.0 changes output
Any news on this issue?
Can you also post the host code.
Do you mean the kernel fails correctness when adding 0.0 to the output, or does it fail the correctness in the kernel code you posted also.
The kernels fails correctness regardless of adding 0.0, but adding 0.0 changes the output (which it should not).
Regarding the host code, I'll have to create something from scratch as the whole project is over 2400 lines of code, most of it not relevant to this problem (and I would prefer not to upload it here). I'll post the new host code soon.
Here is the host code and the kernel code below. It is assembled from many different files and as such looks a mess, but it is working perfectly on nVidia and on AMD platform using CPU, but not AMD GPU.
#define _CRT_SECURE_NO_WARNINGS 1 #include <stdio.h> #include <stdlib.h> #include <math.h> #include <float.h> #include <CL/cl.h> #include <Windows.h> #define KERNEL_TRANSPOSE 0 #define NUM_KERNELS 1 #define TEST_OK 0 #define TEST_FAIL 1 typedef struct ccKernel_struct { cl_kernel kernel; unsigned int work_dim; unsigned int global_work_size[3]; unsigned int local_work_size[3]; } ccKernel; #define inline __inline static cl_uint num_devices; static cl_device_id *devices; static cl_context context; static cl_program program; ccKernel cckernel[NUM_KERNELS]; cl_command_queue queue; // must match defines in cl file #define BLOCK_SIZE 8 void checkErr(cl_int err, const char * name, ...) { va_list argptr; va_start(argptr,name); if (err != CL_SUCCESS) { fprintf(stderr, "ERROR %i: ", err); vfprintf(stderr, name, argptr); exit(EXIT_FAILURE); } va_end(argptr); } struct p2_t { double *A; int lda; cl_mem mem; int row_offset; int col_offset; int size; // only used for clCreateSubBuffer p2_t() {} p2_t( double *_A, int _lda ) : A(_A),lda(_lda),mem(NULL),row_offset(0),col_offset(0),size(0) {} p2_t( double *_A, int _lda, cl_mem _mem, int _i, int _j, int _size ) : A(_A),lda(_lda), mem(_mem),row_offset(_i),col_offset(_j),size(_size) {} p2_t operator() (int i, int j) { return p2_t(A+i+j*lda,lda,mem,row_offset+i,col_offset+j,size); }; double &at (int i, int j) { return A[i+j*lda]; }; // CPU ONLY! }; p2_t gpu_malloc2D(int height, int width, cl_mem_flags flags) { cl_int err; if( height <= 0 || width <= 0 ) return p2_t( NULL, 0 ); p2_t p2(NULL, height); // allocate device memory p2.mem = clCreateBuffer(context, flags, sizeof(cl_double) * height * width, NULL, &err); checkErr(err, "gpu_malloc2D\n"); p2.row_offset = 0; p2.col_offset = 0; p2.size = height*width*sizeof(double); return p2; } p2_t gpu_malloc2D(int height, int width) { return gpu_malloc2D(height, width, CL_MEM_READ_WRITE); } void upload(int m, int n, p2_t dst, p2_t src ) { cl_int err; if( m > 0 && n > 0 ) { const size_t buffer_origin[3] = {dst.row_offset*sizeof(double), dst.col_offset, 0}; const size_t host_origin[3] = {0, 0, 0}; const size_t region[3] = {m*sizeof(double), n, 1}; err = clEnqueueWriteBufferRect(queue, dst.mem, CL_TRUE, buffer_origin, host_origin, region, dst.lda*sizeof(double), 0, src.lda*sizeof(double), 0, src.A, 0, NULL, NULL); checkErr(err, "clEnqueueWriteBufferRect\n"); } } void download( int m, int n, p2_t dst, p2_t src ) { if( m > 0 && n > 0) { cl_int err; const size_t buffer_origin[3] = {src.row_offset*sizeof(double), src.col_offset, 0}; const size_t host_origin[3] = {0, 0, 0}; const size_t region[3] = {m*sizeof(double), n, 1}; err = clEnqueueReadBufferRect(queue, src.mem, CL_TRUE, buffer_origin, host_origin, region, src.lda*sizeof(double), 0, dst.lda*sizeof(double), 0, dst.A, 0, NULL, NULL); checkErr(err, "clEnqueueReadBufferRect\n"); } } void ccSetWorkDim(ccKernel *kernel, int dim) { kernel->work_dim = dim; } void ccSetLocalWorkSize(ccKernel *kernel, int dim1, int dim2, int dim3) { kernel->local_work_size[0] = dim1; kernel->local_work_size[1] = dim2; kernel->local_work_size[2] = dim3; } void ccSetGlobalWorkSize(ccKernel *kernel, int dim1, int dim2, int dim3) { kernel->global_work_size[0] = dim1; kernel->global_work_size[1] = dim2; kernel->global_work_size[2] = dim3; } int ccCheckWorkGroupSize(ccKernel kernel) { size_t maxWorkGroupSize; cl_int err; err = clGetKernelWorkGroupInfo(kernel.kernel, devices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, NULL); checkErr(err, "clGetKernelWorkGroupInfo"); if (kernel.local_work_size[0]*kernel.local_work_size[1]*kernel.local_work_size[2]>maxWorkGroupSize) { fprintf(stderr,"prod(local_work_size
galmok,
I tried your kernel on my juniper. I had to convert it to a float application(No DP Support on Juniper). The kernel gives minerrror and maxerror both as zero for both GPU & CPU. So I think this has been fixed in the internal SDK .
Still I will check that on a DP supported device when I get a chance.
Thanks for reporting this.
I have tried modifiying the kernel (and host) code to use a local work group size of 64 (16*4) but the transpose is still failing (yet works on nVidia and CPU). I guess this little kernel triggers a double-related bug somehow.
I have now tried this transpose kernel on another pc (also with a 5870, but using Vista64 instead of Win7-64) and the kernel also fails here... but not always(!)
So far, it seems the kernel makes a correct transpose if I in MSVC2010 make a Clean Solution followed by Run without Debug (ctrl-F5). Subsequent runs either continue to work fine or cause incorrect result. When I edit the project and make an incremental build, the result is usually flawed. It almost feels as if there is something unintialised somewhere, but I can't see what it could be.
I have tried this with CCC 11.4 and SDK 2.4 and the problem has worsened in that it crashes Windows 7 with a BSOD now instead of just giving incorrect result. ATIKMPAG.SYS is the crashing system file.
Can anyone confirm this kernel to be crashing Windows? Blue Screen of Death in ATIKMPAG.SYS.
AMD guys are surprisingly quiet about these crash issues on 2.4/11.4 combo (and not only on this post), no comment whatsoever, I wonder why!
Because we're busy?
Anyway, I'm running it on a fairly recent internal debug OpenCL build on Cypress.
It does crash. You are right. It's a pain of a crash too in that it's killing my machine completely. It runs fine if I put a printf in the kernel write loop. I shall see if anyone can investigate.
Thank you for confirming the issue.
yes, i can understand that you're busy, fortunately But that's not a reason for not a single answer from no one, nothing but to say that you are aware of it, like you just did.
thx for taking the time, anyway.
I can also confirm that the issue is reproducible on a cayman card.
Although the app did not crash, but it shows transpose failed on GPU. Also it shows transpose passed when I tried to debug using printf.