i have a global cl_mem object called htt.
htt is a single dimensional array .. but its used to represent a 3D array
of size [2][LM+2][M_BLOCK]
thus its size is 2 * (LM+2) * M_BLOCK * sizeod(cl_float)
after i get it back .. i use a 3d offset inside the braces [ ]
I use it in multiple kernels . the problem is that the value is not passed consistently to the other kernels.
There is a weird issue, when i copy the buffer back right after the execution of the kernel, i can get the right value of htt anywhere in my code, if i dont, then i cannot get the right value of htt anywhere in my code.
The code is attached. please help me out. thanks in advance .
// -------------------------- CODE execute_sectionD(); int cdim[3]; cdim[0] = M_BLOCK; cdim[1] = LM+2; cdim[2] = 2; float *ptr;//[2][LM+2][M_BLOCK]; ptr = malloc (2 * (LM+2) * M_BLOCK * sizeof(float)); myerror = .0000001f; get_device_array("htt ",ptr,3,cdim); // gets the device array htt .. into ptr check_arrays("htt",myerror,ptr,tt,3,cdim); //checks 1 to 1 with the correct value present in tt for ptr on the outside // ------------------------------------------- calling the kernel void execute_sectionD(){ int MAKE_BINARY = 1; int USE_BINARY = 2; //****************** SECTION D cl_kernel hKernel = getkernel("sectionD.cl","sectionD","sectionD.binary",USE_BINARY); // setup parameter values CHECK_ERROR(clSetKernelArg(hKernel, 0, sizeof(cl_mem), (void *)&hdsm)); CHECK_ERROR(clSetKernelArg(hKernel, 1, sizeof(cl_mem), (void *)&hfdiruv)); CHECK_ERROR(clSetKernelArg(hKernel, 2, sizeof(cl_mem), (void *)&hfdifuv)); CHECK_ERROR(clSetKernelArg(hKernel, 3, sizeof(cl_mem), (void *)&hrr)); CHECK_ERROR(clSetKernelArg(hKernel, 4, sizeof(cl_mem), (void *)&hrsuvbm)); CHECK_ERROR(clSetKernelArg(hKernel, 5, sizeof(cl_mem), (void *)&hrs)); CHECK_ERROR(clSetKernelArg(hKernel, 6, sizeof(cl_mem), (void *)&hrsuvdf)); CHECK_ERROR(clSetKernelArg(hKernel, 7, sizeof(cl_mem), (void *)&htd)); CHECK_ERROR(clSetKernelArg(hKernel, 8, sizeof(cl_mem), (void *)&htt)); CHECK_ERROR(clSetKernelArg(hKernel, 9, sizeof(cl_mem), (void *)&hts)); CHECK_ERROR(clSetKernelArg(hKernel, 10, sizeof(cl_mem), (void *)&hcc)); CHECK_ERROR(clSetKernelArg(hKernel, 11, sizeof(cl_mem), (void *)&hLM)); CHECK_ERROR(clSetKernelArg(hKernel, 12, sizeof(cl_mem), (void *)&hM_BLOCK)); // execute kernel int dim = 1; //x , y size_t globalWorkSizeData[1] = {M_BLOCK}; CHECK_ERROR(clEnqueueNDRangeKernel(hCmdQueue, hKernel, dim,NULL, globalWorkSizeData, NULL,0, NULL, NULL)); clEnqueueBarrier (hCmdQueue); /* IF THIS PART OF THE CODE IS COMMENTED I GET THE WRONG VALUE OF HTT WHEN I USE CHECK_ARRAY() //OTHERWISE , WHEN IT IS NOT COMMENTED I GET THE RIGHT VALUE WHEN I CHECK THE VALUE OUTSIDE. float *ptr;//[2][LM+2][M_BLOCK]; ptr = malloc (2 * (LM+2) * M_BLOCK * sizeof(float)); CHECK_ERROR( clEnqueueReadBuffer(hCmdQueue, htt, CL_TRUE, 0,2*(LM+2)*M_BLOCK*sizeof(cl_float),ptr, 0, 0, 0)); int ii=0,jj=0,kk=0; for(kk=0;kk<2;kk++) {for(jj=0;jj<(LM+2);jj++) {for(ii=0;ii<M_BLOCK;ii++) { if(ptr[ kk*(LM+2)*M_BLOCK + jj*M_BLOCK + ii ] ==1.0f) printf( "INSIDE[%i,%i,%i]=%f ---" ,kk,jj,ii, ptr[kk*(LM+2)*M_BLOCK + jj*M_BLOCK + ii]); }}} free(ptr); */ clReleaseKernel(hKernel); } // ------------------- htt initialization htt = clCreateBuffer(hContext,CL_MEM_READ_WRITE, 2*(LM+2)*M_BLOCK*sizeof(cl_float),0, 0); //-------------------------------- KERNEL CODE _kernel void sectionD( __global float * dsm, __global float * fdiruv, __global float * fdifuv, __global float * rr, __global float * rsuvbm, __global float * rs , __global float * rsuvdf, __global float * td, __global float * tt, __global float * ts, __global float * cc, __global const int *hLM, __global const int *hM_BLOCK ) { // Vector element index int xIndex = get_global_id(0); int mb = xIndex; int M_BLOCK = hM_BLOCK[0]; int LM = hLM[0]; tt[0*(LM+2)*M_BLOCK + 0 *M_BLOCK + mb]=1.0f; tt[1*(LM+2)*M_BLOCK + 0 *M_BLOCK + mb]=1.0f; }
hi ,
all i know is that the device i am running opencl on is a
JS22 (Power6 4GHz): with linux 2.6.18-164.2.1.el5
i am using an out of order queure, but i make sure i have barriers enqueued for all results to pass ... this is really annoying :'(
I tried doing a row major offset in the ptr array and a column major, thinking that maybe the device moves things around after the kernel has run successfully .
The worst part is there is not a single array element that equal 1 ( ..which the kernel execution is supposed to do )...
but when i copy a buffer right after the kernel executes, it all works fine. :S
i have like a 100+ arrays in the code .. 2D and 3D... i cant bring em back right after execution. that would kill the performance.. i am trying to do all the computation in the device by passing cl_mem object addresses to different kernels ..
try use in order queue. ati implementation did not support out of order queue.
that would kill some performance gains :'(
but yes i think i will try it now that i cant find a solution. Looks like a driver issue or some sort.
changed it to inorder execution ... but
didnt work ...
there seems to be a problem in getting the correct buffer value ...
is it possible that the enqueue read executes before the kernel execution in execute_sectionA() ??????????????????????
as in when i check in point A right after kernel execution .. i get the right answer
but at point B .. i just get a 0 in the array ..
execute_sectionA(getkernel("sectionA.cl","sectionA","sectionA.binary",USE_BINARY)/*,answer*/); -----POINT A for (i=0; i<M_BLOCK; i++) { swh[0] = 1; } // dimensions of swh = [LM+1][M_BLOCK] int cdim[3]; cdim[0] = M_BLOCK; cdim[1] = LM+1; cdim[2] = 2; float *ptr;//[2][LM+2][M_BLOCK]; ptr = malloc ((LM+1) * M_BLOCK * sizeof(float)); myerror = .001f; get_device_array("hswh",ptr,2,cdim); -------------------- POINT B check_arrays("hswh",myerror,ptr,swh,2,cdim); ----------- POINT B
zoalord12,
Could you provide a test-case(source and kernel code)?. Its easier to reproduce the issue that way and will help us to solve the issue faster.